From 34cfc3961f96c4252ca395f3d0f48fa7cac98bb3 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 12 Jan 2026 23:08:48 +0100 Subject: [PATCH 1/2] AMDGPU: Move softPromoteHalfType override to R600 only As expected the code is much worse, but more correct. We could do a better job with source modifier management around fp16_to_fp/fp_to_fp16. --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 2 - llvm/lib/Target/AMDGPU/R600ISelLowering.h | 2 + .../CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll | 40327 +++++++--------- .../CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll | 2814 +- .../CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll | 1290 +- .../CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll | 123 +- .../CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll | 2452 +- .../CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll | 1750 +- .../CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll | 5710 +-- .../CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll | 2304 +- .../CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll | 6520 ++- .../CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll | 668 +- .../CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll | 2728 +- .../CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll | 4957 +- .../CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll | 5627 +-- .../CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll | 340 +- .../CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll | 14491 +++--- .../CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll | 8360 ++-- .../CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll | 9829 ++-- .../CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll | 1434 +- .../CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll | 11290 ++--- .../CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll | 13055 ++--- .../CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll | 14580 +++--- .../CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll | 16063 +++--- .../CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll | 18023 +++---- .../CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll | 1660 +- .../atomic_optimizations_global_pointer.ll | 70 +- llvm/test/CodeGen/AMDGPU/br_cc.f16.ll | 42 +- .../buffer-fat-pointer-atomicrmw-fadd.ll | 823 +- .../buffer-fat-pointer-atomicrmw-fmax.ll | 407 +- .../buffer-fat-pointer-atomicrmw-fmin.ll | 407 +- .../CodeGen/AMDGPU/calling-conventions.ll | 32 +- llvm/test/CodeGen/AMDGPU/clamp-modifier.ll | 175 +- llvm/test/CodeGen/AMDGPU/clamp.ll | 174 +- .../AMDGPU/constant-address-space-32bit.ll | 586 +- .../CodeGen/AMDGPU/dagcombine-fmul-sel.ll | 53 +- .../AMDGPU/divergence-driven-buildvector.ll | 2 +- .../CodeGen/AMDGPU/extract-subvector-16bit.ll | 294 +- .../CodeGen/AMDGPU/extract_vector_elt-f16.ll | 111 +- llvm/test/CodeGen/AMDGPU/fabs.f16.ll | 16 +- .../CodeGen/AMDGPU/fadd-fma-fmul-combine.ll | 12 + llvm/test/CodeGen/AMDGPU/fadd.f16.ll | 38 +- llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll | 326 +- llvm/test/CodeGen/AMDGPU/fcmp.f16.ll | 252 +- llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 28 +- llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 2345 +- llvm/test/CodeGen/AMDGPU/fdiv.f16.ll | 44 +- .../CodeGen/AMDGPU/flat-atomicrmw-fadd.ll | 598 +- .../CodeGen/AMDGPU/flat-atomicrmw-fmax.ll | 436 +- .../CodeGen/AMDGPU/flat-atomicrmw-fmin.ll | 436 +- .../CodeGen/AMDGPU/flat-atomicrmw-fsub.ll | 436 +- llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll | 228 +- llvm/test/CodeGen/AMDGPU/fmax3.ll | 68 +- llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll | 297 +- .../test/CodeGen/AMDGPU/fmed3-cast-combine.ll | 495 +- llvm/test/CodeGen/AMDGPU/fmed3.ll | 97 +- llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll | 228 +- llvm/test/CodeGen/AMDGPU/fmin3.ll | 68 +- llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll | 297 +- llvm/test/CodeGen/AMDGPU/fmul.f16.ll | 82 +- llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll | 2179 +- llvm/test/CodeGen/AMDGPU/fneg-combines.ll | 15 +- llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll | 38 +- llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll | 1 - .../CodeGen/AMDGPU/fneg-modifier-casting.ll | 40 +- llvm/test/CodeGen/AMDGPU/fneg.f16.ll | 14 +- llvm/test/CodeGen/AMDGPU/fp-classify.ll | 37 +- llvm/test/CodeGen/AMDGPU/fpext.f16.ll | 26 +- llvm/test/CodeGen/AMDGPU/fpow.ll | 22 +- llvm/test/CodeGen/AMDGPU/fptosi.f16.ll | 6 +- llvm/test/CodeGen/AMDGPU/fract-match.ll | 248 +- llvm/test/CodeGen/AMDGPU/freeze.ll | 52 +- llvm/test/CodeGen/AMDGPU/frem.ll | 2230 +- .../CodeGen/AMDGPU/global-atomicrmw-fadd.ll | 1536 +- .../CodeGen/AMDGPU/global-atomicrmw-fmax.ll | 936 +- .../CodeGen/AMDGPU/global-atomicrmw-fmin.ll | 936 +- .../CodeGen/AMDGPU/global-atomicrmw-fsub.ll | 936 +- llvm/test/CodeGen/AMDGPU/half.ll | 559 +- .../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll | 243 +- llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll | 14 +- llvm/test/CodeGen/AMDGPU/llvm.exp.ll | 106 +- llvm/test/CodeGen/AMDGPU/llvm.exp10.ll | 116 +- llvm/test/CodeGen/AMDGPU/llvm.exp2.ll | 58 +- llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll | 1084 +- llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll | 85 +- llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll | 60 +- llvm/test/CodeGen/AMDGPU/llvm.log.ll | 103 +- llvm/test/CodeGen/AMDGPU/llvm.log10.ll | 103 +- llvm/test/CodeGen/AMDGPU/llvm.log2.ll | 81 +- llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll | 534 +- llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll | 120 +- llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll | 128 +- llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll | 14 +- .../CodeGen/AMDGPU/local-atomicrmw-fadd.ll | 292 +- .../CodeGen/AMDGPU/local-atomicrmw-fmax.ll | 292 +- .../CodeGen/AMDGPU/local-atomicrmw-fmin.ll | 292 +- .../CodeGen/AMDGPU/local-atomicrmw-fsub.ll | 292 +- llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll | 44 +- llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll | 318 +- llvm/test/CodeGen/AMDGPU/mad-mix.ll | 426 +- llvm/test/CodeGen/AMDGPU/maximumnum.ll | 859 +- llvm/test/CodeGen/AMDGPU/minimumnum.ll | 834 +- llvm/test/CodeGen/AMDGPU/omod.ll | 12 +- llvm/test/CodeGen/AMDGPU/repeated-divisor.ll | 184 +- llvm/test/CodeGen/AMDGPU/roundeven.ll | 28 +- .../AMDGPU/select-fabs-fneg-extract.f16.ll | 485 +- .../AMDGPU/select-fabs-fneg-extract.v2f16.ll | 1018 +- .../AMDGPU/select-flags-to-fmin-fmax.ll | 554 +- llvm/test/CodeGen/AMDGPU/select-phi-s16-fp.ll | 20 +- llvm/test/CodeGen/AMDGPU/select.f16.ll | 861 +- llvm/test/CodeGen/AMDGPU/strict_fpext.ll | 41 +- llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll | 41 +- .../AMDGPU/strictfp_f16_abi_promote.ll | 90 +- llvm/test/CodeGen/AMDGPU/v_cndmask.ll | 5 +- llvm/test/CodeGen/AMDGPU/v_mac.ll | 12 +- llvm/test/CodeGen/AMDGPU/v_mac_f16.ll | 279 +- llvm/test/CodeGen/AMDGPU/v_madak_f16.ll | 34 +- .../test/CodeGen/AMDGPU/vector-reduce-fadd.ll | 200 +- .../test/CodeGen/AMDGPU/vector-reduce-fmax.ll | 191 +- .../CodeGen/AMDGPU/vector-reduce-fmaximum.ll | 268 +- .../test/CodeGen/AMDGPU/vector-reduce-fmin.ll | 191 +- .../CodeGen/AMDGPU/vector-reduce-fminimum.ll | 268 +- .../test/CodeGen/AMDGPU/vector-reduce-fmul.ll | 200 +- 123 files changed, 94614 insertions(+), 123079 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index dde84e2090b90..2f8777fffdc92 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -405,8 +405,6 @@ class AMDGPUTargetLowering : public TargetLowering { // are using vector compares until that is fixed. return true; } - - bool softPromoteHalfType() const override { return false; } }; } // End namespace llvm diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.h b/llvm/lib/Target/AMDGPU/R600ISelLowering.h index 661efb8684813..bb7fc46a98cbd 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.h +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.h @@ -117,6 +117,8 @@ class R600TargetLowering final : public AMDGPUTargetLowering { TargetLowering::AtomicExpansionKind shouldExpandAtomicRMWInIR(const AtomicRMWInst *RMW) const override; + + bool softPromoteHalfType() const override { return false; } }; } // End namespace llvm; diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index 2ce67c3848bae..a62a1828a6e93 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -30728,6 +30728,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v32i32_to_v64f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -30744,650 +30746,219 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB20_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v30 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v27 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v56, v2 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v32, v31, v30, 16 +; SI-NEXT: v_alignbit_b32 v33, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v34, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v35, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v36, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v37, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v38, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v39, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v48, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v49, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v50, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v53, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v55, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v42, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v44, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v47, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v29 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v58, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 ; SI-NEXT: .LBB20_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB20_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v62 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v44 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v46 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 -; SI-NEXT: v_mov_b32_e32 v32, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v31 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v19 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v63 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v18 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v59 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v32, v31, v30, 16 +; SI-NEXT: v_alignbit_b32 v33, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v34, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v35, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v36, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v37, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v38, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v39, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v48, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v49, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v50, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v53, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v55, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v42, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v44, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v47, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v1 -; SI-NEXT: v_mov_b32_e32 v38, v28 -; SI-NEXT: v_mov_b32_e32 v36, v30 -; SI-NEXT: v_mov_b32_e32 v46, v27 -; SI-NEXT: v_mov_b32_e32 v31, v29 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 ; SI-NEXT: .LBB20_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; SI-NEXT: v_or_b32_e32 v0, v0, v47 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v63 +; SI-NEXT: v_or_b32_e32 v2, v2, v44 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v62 +; SI-NEXT: v_or_b32_e32 v4, v4, v42 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v61 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v47 +; SI-NEXT: v_or_b32_e32 v3, v3, v44 +; SI-NEXT: v_or_b32_e32 v5, v5, v42 +; SI-NEXT: v_or_b32_e32 v6, v6, v55 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v60 +; SI-NEXT: v_or_b32_e32 v8, v8, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v59 +; SI-NEXT: v_or_b32_e32 v10, v10, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v58 +; SI-NEXT: v_or_b32_e32 v12, v12, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v57 +; SI-NEXT: v_or_b32_e32 v14, v14, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v56 +; SI-NEXT: v_or_b32_e32 v16, v16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v46 +; SI-NEXT: v_or_b32_e32 v18, v18, v38 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v45 +; SI-NEXT: v_or_b32_e32 v20, v20, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v43 +; SI-NEXT: v_or_b32_e32 v22, v22, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v41 +; SI-NEXT: v_or_b32_e32 v24, v24, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v40 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v56 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v55 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v35 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v63 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v48 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v40 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v6, v41 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v53 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v10, v49 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v37 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v34 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v50 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v54 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v38 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v42 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload @@ -31404,12 +30975,44 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v36 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; SI-NEXT: v_or_b32_e32 v31, v33, v31 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v26, v26, v34 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v54 +; SI-NEXT: v_or_b32_e32 v28, v28, v33 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v52 +; SI-NEXT: v_or_b32_e32 v30, v30, v32 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v51 +; SI-NEXT: v_or_b32_e32 v7, v7, v55 +; SI-NEXT: v_or_b32_e32 v9, v9, v53 +; SI-NEXT: v_or_b32_e32 v11, v11, v50 +; SI-NEXT: v_or_b32_e32 v13, v13, v49 +; SI-NEXT: v_or_b32_e32 v15, v15, v48 +; SI-NEXT: v_or_b32_e32 v17, v17, v39 +; SI-NEXT: v_or_b32_e32 v19, v19, v38 +; SI-NEXT: v_or_b32_e32 v21, v21, v37 +; SI-NEXT: v_or_b32_e32 v23, v23, v36 +; SI-NEXT: v_or_b32_e32 v25, v25, v35 +; SI-NEXT: v_or_b32_e32 v27, v27, v34 +; SI-NEXT: v_or_b32_e32 v29, v29, v33 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -31585,197 +31188,128 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i ; SI-LABEL: bitcast_v32i32_to_v64f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v32, s30, 0 +; SI-NEXT: v_writelane_b32 v32, s31, 1 +; SI-NEXT: v_writelane_b32 v32, s34, 2 +; SI-NEXT: v_writelane_b32 v32, s35, 3 +; SI-NEXT: v_writelane_b32 v32, s36, 4 +; SI-NEXT: v_writelane_b32 v32, s37, 5 +; SI-NEXT: v_writelane_b32 v32, s38, 6 ; SI-NEXT: v_mov_b32_e32 v19, s16 -; SI-NEXT: v_readfirstlane_b32 s40, v19 +; SI-NEXT: v_writelane_b32 v32, s39, 7 +; SI-NEXT: v_readfirstlane_b32 s56, v19 ; SI-NEXT: v_mov_b32_e32 v19, s17 -; SI-NEXT: v_readfirstlane_b32 s41, v19 +; SI-NEXT: v_writelane_b32 v32, s48, 8 +; SI-NEXT: v_readfirstlane_b32 s57, v19 ; SI-NEXT: v_mov_b32_e32 v19, s18 -; SI-NEXT: v_readfirstlane_b32 s42, v19 +; SI-NEXT: v_writelane_b32 v32, s49, 9 +; SI-NEXT: v_readfirstlane_b32 s46, v19 ; SI-NEXT: v_mov_b32_e32 v19, s19 -; SI-NEXT: v_readfirstlane_b32 s43, v19 +; SI-NEXT: v_writelane_b32 v32, s50, 10 +; SI-NEXT: v_readfirstlane_b32 s47, v19 ; SI-NEXT: v_mov_b32_e32 v19, s20 +; SI-NEXT: v_writelane_b32 v32, s51, 11 ; SI-NEXT: v_readfirstlane_b32 s44, v19 ; SI-NEXT: v_mov_b32_e32 v19, s21 +; SI-NEXT: v_writelane_b32 v32, s52, 12 ; SI-NEXT: v_readfirstlane_b32 s45, v19 ; SI-NEXT: v_mov_b32_e32 v19, s22 -; SI-NEXT: v_readfirstlane_b32 s46, v19 +; SI-NEXT: v_writelane_b32 v32, s53, 13 +; SI-NEXT: v_readfirstlane_b32 s42, v19 ; SI-NEXT: v_mov_b32_e32 v19, s23 -; SI-NEXT: v_readfirstlane_b32 s47, v19 +; SI-NEXT: v_writelane_b32 v32, s54, 14 +; SI-NEXT: v_readfirstlane_b32 s43, v19 ; SI-NEXT: v_mov_b32_e32 v19, s24 -; SI-NEXT: v_readfirstlane_b32 s24, v19 +; SI-NEXT: v_writelane_b32 v32, s55, 15 +; SI-NEXT: v_readfirstlane_b32 s40, v19 ; SI-NEXT: v_mov_b32_e32 v19, s25 -; SI-NEXT: v_readfirstlane_b32 s25, v19 +; SI-NEXT: v_writelane_b32 v32, s64, 16 +; SI-NEXT: v_readfirstlane_b32 s41, v19 ; SI-NEXT: v_mov_b32_e32 v19, s26 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v63, s30, 0 -; SI-NEXT: v_readfirstlane_b32 s26, v19 +; SI-NEXT: v_writelane_b32 v32, s65, 17 +; SI-NEXT: v_readfirstlane_b32 s24, v19 ; SI-NEXT: v_mov_b32_e32 v19, s27 -; SI-NEXT: v_writelane_b32 v63, s31, 1 -; SI-NEXT: v_readfirstlane_b32 s27, v19 +; SI-NEXT: v_writelane_b32 v32, s66, 18 +; SI-NEXT: v_readfirstlane_b32 s25, v19 ; SI-NEXT: v_mov_b32_e32 v19, s28 -; SI-NEXT: v_writelane_b32 v63, s34, 2 -; SI-NEXT: v_readfirstlane_b32 s28, v19 +; SI-NEXT: v_writelane_b32 v32, s67, 19 +; SI-NEXT: v_readfirstlane_b32 s22, v19 ; SI-NEXT: v_mov_b32_e32 v19, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: v_writelane_b32 v63, s35, 3 -; SI-NEXT: v_readfirstlane_b32 s29, v19 -; SI-NEXT: v_readfirstlane_b32 s23, v0 -; SI-NEXT: v_readfirstlane_b32 s22, v1 -; SI-NEXT: v_readfirstlane_b32 s21, v2 -; SI-NEXT: v_readfirstlane_b32 s20, v3 -; SI-NEXT: v_readfirstlane_b32 s19, v4 -; SI-NEXT: v_readfirstlane_b32 s18, v5 -; SI-NEXT: v_readfirstlane_b32 s17, v6 -; SI-NEXT: v_readfirstlane_b32 s16, v7 -; SI-NEXT: v_readfirstlane_b32 s15, v8 -; SI-NEXT: v_readfirstlane_b32 s14, v9 -; SI-NEXT: v_readfirstlane_b32 s13, v10 -; SI-NEXT: v_readfirstlane_b32 s12, v11 -; SI-NEXT: v_readfirstlane_b32 s11, v12 -; SI-NEXT: v_readfirstlane_b32 s10, v13 -; SI-NEXT: v_readfirstlane_b32 s8, v14 +; SI-NEXT: v_writelane_b32 v32, s68, 20 +; SI-NEXT: v_readfirstlane_b32 s23, v19 +; SI-NEXT: v_readfirstlane_b32 s20, v0 +; SI-NEXT: v_readfirstlane_b32 s21, v1 +; SI-NEXT: v_readfirstlane_b32 s18, v2 +; SI-NEXT: v_readfirstlane_b32 s19, v3 +; SI-NEXT: v_readfirstlane_b32 s16, v4 +; SI-NEXT: v_readfirstlane_b32 s17, v5 +; SI-NEXT: v_readfirstlane_b32 s14, v6 +; SI-NEXT: v_readfirstlane_b32 s15, v7 +; SI-NEXT: v_readfirstlane_b32 s12, v8 +; SI-NEXT: v_readfirstlane_b32 s13, v9 +; SI-NEXT: v_readfirstlane_b32 s10, v10 +; SI-NEXT: v_readfirstlane_b32 s11, v11 +; SI-NEXT: v_readfirstlane_b32 s8, v12 +; SI-NEXT: v_readfirstlane_b32 s9, v13 +; SI-NEXT: v_readfirstlane_b32 s6, v14 ; SI-NEXT: v_readfirstlane_b32 s7, v15 -; SI-NEXT: v_readfirstlane_b32 s6, v16 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v17 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s4, v16 +; SI-NEXT: s_and_b64 s[26:27], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v17 +; SI-NEXT: v_writelane_b32 v32, s69, 21 ; SI-NEXT: s_cbranch_scc0 .LBB21_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 -; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 -; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s47, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s46, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s45, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s44, 16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 -; SI-NEXT: s_lshr_b32 s4, s43, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s42, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s41, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s40, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v62, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s40 +; SI-NEXT: s_lshr_b32 s38, s5, 16 +; SI-NEXT: s_lshr_b32 s39, s7, 16 +; SI-NEXT: s_lshr_b32 s48, s9, 16 +; SI-NEXT: s_lshr_b32 s49, s11, 16 +; SI-NEXT: s_lshr_b32 s50, s13, 16 +; SI-NEXT: s_lshr_b32 s51, s15, 16 +; SI-NEXT: s_lshr_b32 s52, s17, 16 +; SI-NEXT: s_lshr_b32 s53, s19, 16 +; SI-NEXT: s_lshr_b32 s54, s21, 16 +; SI-NEXT: s_lshr_b32 s55, s23, 16 +; SI-NEXT: s_lshr_b32 s64, s25, 16 +; SI-NEXT: s_lshr_b32 s65, s41, 16 +; SI-NEXT: s_lshr_b32 s66, s43, 16 +; SI-NEXT: s_lshr_b32 s67, s45, 16 +; SI-NEXT: s_lshr_b32 s68, s47, 16 +; SI-NEXT: s_lshr_b32 s69, s57, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[36:37], s[56:57], 16 ; SI-NEXT: s_cbranch_execnz .LBB21_3 ; SI-NEXT: .LBB21_2: ; %cmp.true -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_lshr_b32 s35, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s35 -; SI-NEXT: s_add_i32 s40, s40, 3 -; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_add_i32 s42, s42, 3 -; SI-NEXT: s_add_i32 s43, s43, 3 -; SI-NEXT: s_add_i32 s44, s44, 3 -; SI-NEXT: s_add_i32 s45, s45, 3 -; SI-NEXT: s_add_i32 s46, s46, 3 +; SI-NEXT: s_add_i32 s57, s57, 3 +; SI-NEXT: s_add_i32 s56, s56, 3 ; SI-NEXT: s_add_i32 s47, s47, 3 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s46, s46, 3 +; SI-NEXT: s_add_i32 s45, s45, 3 +; SI-NEXT: s_add_i32 s44, s44, 3 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 @@ -31790,335 +31324,233 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i ; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: s_lshr_b32 s4, s40, 16 -; SI-NEXT: s_lshr_b32 s5, s41, 16 -; SI-NEXT: s_lshr_b32 s56, s42, 16 -; SI-NEXT: s_lshr_b32 s57, s43, 16 -; SI-NEXT: s_lshr_b32 s58, s44, 16 -; SI-NEXT: s_lshr_b32 s59, s45, 16 -; SI-NEXT: s_lshr_b32 s60, s46, 16 -; SI-NEXT: s_lshr_b32 s61, s47, 16 -; SI-NEXT: s_lshr_b32 s62, s24, 16 -; SI-NEXT: s_lshr_b32 s63, s25, 16 -; SI-NEXT: s_lshr_b32 s72, s26, 16 -; SI-NEXT: s_lshr_b32 s73, s27, 16 -; SI-NEXT: s_lshr_b32 s74, s28, 16 -; SI-NEXT: s_lshr_b32 s75, s29, 16 -; SI-NEXT: s_lshr_b32 s76, s23, 16 -; SI-NEXT: s_lshr_b32 s77, s22, 16 -; SI-NEXT: s_lshr_b32 s78, s21, 16 -; SI-NEXT: s_lshr_b32 s79, s20, 16 -; SI-NEXT: s_lshr_b32 s88, s19, 16 -; SI-NEXT: s_lshr_b32 s89, s18, 16 -; SI-NEXT: s_lshr_b32 s90, s17, 16 -; SI-NEXT: s_lshr_b32 s91, s16, 16 -; SI-NEXT: s_lshr_b32 s92, s15, 16 -; SI-NEXT: s_lshr_b32 s93, s14, 16 -; SI-NEXT: s_lshr_b32 s94, s13, 16 -; SI-NEXT: s_lshr_b32 s95, s12, 16 -; SI-NEXT: s_lshr_b32 vcc_lo, s11, 16 -; SI-NEXT: s_lshr_b32 vcc_hi, s10, 16 -; SI-NEXT: s_lshr_b32 s30, s8, 16 -; SI-NEXT: s_lshr_b32 s31, s7, 16 -; SI-NEXT: s_lshr_b32 s34, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s24 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v56, s47 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v57, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s45 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v59, s44 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v60, s43 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v61, s42 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v62, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v31, s34 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s31 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s30 -; SI-NEXT: v_cvt_f32_f16_e32 v26, vcc_hi -; SI-NEXT: v_cvt_f32_f16_e32 v27, vcc_lo -; SI-NEXT: v_cvt_f32_f16_e32 v24, s95 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s94 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s93 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s92 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s91 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s90 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s89 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s88 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s79 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s78 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s77 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s76 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s75 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s74 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s73 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s72 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s63 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s62 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_add_i32 s5, s5, 3 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[22:23], 16 +; SI-NEXT: s_lshr_b32 s38, s5, 16 +; SI-NEXT: s_lshr_b32 s39, s7, 16 +; SI-NEXT: s_lshr_b32 s48, s9, 16 +; SI-NEXT: s_lshr_b32 s49, s11, 16 +; SI-NEXT: s_lshr_b32 s50, s13, 16 +; SI-NEXT: s_lshr_b32 s51, s15, 16 +; SI-NEXT: s_lshr_b32 s52, s17, 16 +; SI-NEXT: s_lshr_b32 s53, s19, 16 +; SI-NEXT: s_lshr_b32 s54, s21, 16 +; SI-NEXT: s_lshr_b32 s55, s23, 16 +; SI-NEXT: s_lshr_b32 s64, s25, 16 +; SI-NEXT: s_lshr_b32 s65, s41, 16 +; SI-NEXT: s_lshr_b32 s66, s43, 16 +; SI-NEXT: s_lshr_b32 s67, s45, 16 +; SI-NEXT: s_lshr_b32 s68, s47, 16 +; SI-NEXT: s_lshr_b32 s69, s57, 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[36:37], s[56:57], 16 ; SI-NEXT: .LBB21_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v30, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v62 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v1, v30, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v2, v30, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v3, v30, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v58 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v59 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v59, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v59 -; SI-NEXT: v_or_b32_e32 v5, v5, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v6, v30, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v8, v30, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v10, v30, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_or_b32_e32 v12, v30, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 -; SI-NEXT: v_or_b32_e32 v14, v30, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_or_b32_e32 v16, v30, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 -; SI-NEXT: v_or_b32_e32 v18, v30, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: v_or_b32_e32 v20, v30, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 -; SI-NEXT: v_or_b32_e32 v22, v30, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v39 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v4, v58, v4 -; SI-NEXT: v_or_b32_e32 v7, v56, v7 -; SI-NEXT: v_or_b32_e32 v9, v46, v9 -; SI-NEXT: v_or_b32_e32 v11, v44, v11 -; SI-NEXT: v_or_b32_e32 v13, v42, v13 -; SI-NEXT: v_or_b32_e32 v15, v40, v15 -; SI-NEXT: v_or_b32_e32 v24, v30, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v37 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v37, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_or_b32_e32 v26, v30, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_or_b32_e32 v28, v30, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v33 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v35 -; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: v_or_b32_e32 v17, v54, v17 -; SI-NEXT: v_or_b32_e32 v19, v52, v19 -; SI-NEXT: v_or_b32_e32 v21, v50, v21 -; SI-NEXT: v_or_b32_e32 v23, v48, v23 -; SI-NEXT: v_or_b32_e32 v25, v38, v25 -; SI-NEXT: v_or_b32_e32 v27, v36, v27 -; SI-NEXT: v_or_b32_e32 v29, v34, v29 -; SI-NEXT: v_readlane_b32 s35, v63, 3 -; SI-NEXT: v_readlane_b32 s34, v63, 2 -; SI-NEXT: v_readlane_b32 s31, v63, 1 -; SI-NEXT: v_readlane_b32 s30, v63, 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v33 -; SI-NEXT: v_or_b32_e32 v31, v32, v31 -; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_lshl_b32 s27, s36, 16 +; SI-NEXT: s_and_b32 s29, s56, 0xffff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s29, s57, 0xffff +; SI-NEXT: s_lshl_b32 s56, s69, 16 +; SI-NEXT: s_or_b32 s29, s29, s56 +; SI-NEXT: s_lshl_b32 s56, s34, 16 +; SI-NEXT: s_and_b32 s46, s46, 0xffff +; SI-NEXT: s_or_b32 s46, s46, s56 +; SI-NEXT: s_and_b32 s47, s47, 0xffff +; SI-NEXT: s_lshl_b32 s56, s68, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_lshl_b32 s56, s30, 16 +; SI-NEXT: s_and_b32 s44, s44, 0xffff +; SI-NEXT: s_or_b32 s44, s44, s56 +; SI-NEXT: s_and_b32 s45, s45, 0xffff +; SI-NEXT: s_lshl_b32 s56, s67, 16 +; SI-NEXT: s_or_b32 s45, s45, s56 +; SI-NEXT: s_lshl_b32 s56, s94, 16 +; SI-NEXT: s_and_b32 s42, s42, 0xffff +; SI-NEXT: s_or_b32 s42, s42, s56 +; SI-NEXT: s_and_b32 s43, s43, 0xffff +; SI-NEXT: s_lshl_b32 s56, s66, 16 +; SI-NEXT: s_or_b32 s43, s43, s56 +; SI-NEXT: s_lshl_b32 s56, s92, 16 +; SI-NEXT: s_and_b32 s40, s40, 0xffff +; SI-NEXT: s_or_b32 s40, s40, s56 +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: s_lshl_b32 s56, s65, 16 +; SI-NEXT: s_or_b32 s41, s41, s56 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_lshl_b32 s56, s90, 16 +; SI-NEXT: s_or_b32 s24, s24, s56 +; SI-NEXT: s_and_b32 s25, s25, 0xffff +; SI-NEXT: s_lshl_b32 s56, s64, 16 +; SI-NEXT: s_or_b32 s25, s25, s56 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_lshl_b32 s56, s88, 16 +; SI-NEXT: s_or_b32 s22, s22, s56 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s56, s55, 16 +; SI-NEXT: s_or_b32 s23, s23, s56 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_lshl_b32 s56, s78, 16 +; SI-NEXT: s_or_b32 s20, s20, s56 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s54, 16 +; SI-NEXT: s_or_b32 s21, s21, s56 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s56, s76, 16 +; SI-NEXT: s_or_b32 s18, s18, s56 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s56, s53, 16 +; SI-NEXT: s_or_b32 s19, s19, s56 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s56, s74, 16 +; SI-NEXT: s_or_b32 s16, s16, s56 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s56, s52, 16 +; SI-NEXT: s_or_b32 s17, s17, s56 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s56, s72, 16 +; SI-NEXT: s_or_b32 s14, s14, s56 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s56, s51, 16 +; SI-NEXT: s_or_b32 s15, s15, s56 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s56, s62, 16 +; SI-NEXT: s_or_b32 s12, s12, s56 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s56, s50, 16 +; SI-NEXT: s_or_b32 s13, s13, s56 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s56, s58, 16 +; SI-NEXT: s_or_b32 s10, s10, s56 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s56, s49, 16 +; SI-NEXT: s_or_b32 s11, s11, s56 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s56, s60, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s28, s28, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s56 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s56, s48, 16 +; SI-NEXT: s_or_b32 s6, s6, s28 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s28, s39, 16 +; SI-NEXT: s_or_b32 s4, s4, s26 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s26, s38, 16 +; SI-NEXT: s_or_b32 s9, s9, s56 +; SI-NEXT: s_or_b32 s7, s7, s28 +; SI-NEXT: s_or_b32 s5, s5, s26 +; SI-NEXT: v_mov_b32_e32 v0, s27 +; SI-NEXT: v_mov_b32_e32 v1, s29 +; SI-NEXT: v_mov_b32_e32 v2, s46 +; SI-NEXT: v_mov_b32_e32 v3, s47 +; SI-NEXT: v_mov_b32_e32 v4, s44 +; SI-NEXT: v_mov_b32_e32 v5, s45 +; SI-NEXT: v_mov_b32_e32 v6, s42 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s40 +; SI-NEXT: v_mov_b32_e32 v9, s41 +; SI-NEXT: v_mov_b32_e32 v10, s24 +; SI-NEXT: v_mov_b32_e32 v11, s25 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v13, s23 +; SI-NEXT: v_mov_b32_e32 v14, s20 +; SI-NEXT: v_mov_b32_e32 v15, s21 +; SI-NEXT: v_mov_b32_e32 v16, s18 +; SI-NEXT: v_mov_b32_e32 v17, s19 +; SI-NEXT: v_mov_b32_e32 v18, s16 +; SI-NEXT: v_mov_b32_e32 v19, s17 +; SI-NEXT: v_mov_b32_e32 v20, s14 +; SI-NEXT: v_mov_b32_e32 v21, s15 +; SI-NEXT: v_mov_b32_e32 v22, s12 +; SI-NEXT: v_mov_b32_e32 v23, s13 +; SI-NEXT: v_mov_b32_e32 v24, s10 +; SI-NEXT: v_mov_b32_e32 v25, s11 +; SI-NEXT: v_mov_b32_e32 v26, s8 +; SI-NEXT: v_mov_b32_e32 v27, s9 +; SI-NEXT: v_mov_b32_e32 v28, s6 +; SI-NEXT: v_mov_b32_e32 v29, s7 +; SI-NEXT: v_mov_b32_e32 v30, s4 +; SI-NEXT: v_mov_b32_e32 v31, s5 +; SI-NEXT: v_readlane_b32 s69, v32, 21 +; SI-NEXT: v_readlane_b32 s68, v32, 20 +; SI-NEXT: v_readlane_b32 s67, v32, 19 +; SI-NEXT: v_readlane_b32 s66, v32, 18 +; SI-NEXT: v_readlane_b32 s65, v32, 17 +; SI-NEXT: v_readlane_b32 s64, v32, 16 +; SI-NEXT: v_readlane_b32 s55, v32, 15 +; SI-NEXT: v_readlane_b32 s54, v32, 14 +; SI-NEXT: v_readlane_b32 s53, v32, 13 +; SI-NEXT: v_readlane_b32 s52, v32, 12 +; SI-NEXT: v_readlane_b32 s51, v32, 11 +; SI-NEXT: v_readlane_b32 s50, v32, 10 +; SI-NEXT: v_readlane_b32 s49, v32, 9 +; SI-NEXT: v_readlane_b32 s48, v32, 8 +; SI-NEXT: v_readlane_b32 s39, v32, 7 +; SI-NEXT: v_readlane_b32 s38, v32, 6 +; SI-NEXT: v_readlane_b32 s37, v32, 5 +; SI-NEXT: v_readlane_b32 s36, v32, 4 +; SI-NEXT: v_readlane_b32 s35, v32, 3 +; SI-NEXT: v_readlane_b32 s34, v32, 2 +; SI-NEXT: v_readlane_b32 s31, v32, 1 +; SI-NEXT: v_readlane_b32 s30, v32, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB21_4: -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: s_branch .LBB21_2 ; ; VI-LABEL: bitcast_v32i32_to_v64f16_scalar: @@ -32381,342 +31813,208 @@ define <32 x i32> @bitcast_v64f16_to_v32i32(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v31 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v57 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v61, v1 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v62, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v60, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v30 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v29 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v27 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v25 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v23 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v21 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_mov_b32_e32 v52, v18 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_mov_b32_e32 v53, v17 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_mov_b32_e32 v54, v16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_mov_b32_e32 v55, v15 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_mov_b32_e32 v40, v14 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_mov_b32_e32 v41, v13 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_mov_b32_e32 v42, v12 +; SI-NEXT: v_mov_b32_e32 v43, v11 +; SI-NEXT: v_mov_b32_e32 v44, v10 +; SI-NEXT: v_mov_b32_e32 v45, v9 +; SI-NEXT: v_mov_b32_e32 v46, v8 +; SI-NEXT: v_mov_b32_e32 v47, v7 +; SI-NEXT: v_mov_b32_e32 v56, v6 +; SI-NEXT: v_mov_b32_e32 v57, v5 +; SI-NEXT: v_mov_b32_e32 v58, v4 +; SI-NEXT: v_mov_b32_e32 v59, v3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v28 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v33 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v62 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB22_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v35 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: v_or_b32_e32 v18, v59, v18 -; SI-NEXT: v_or_b32_e32 v19, v57, v19 -; SI-NEXT: v_or_b32_e32 v20, v47, v20 -; SI-NEXT: v_or_b32_e32 v21, v45, v21 -; SI-NEXT: v_or_b32_e32 v22, v43, v22 -; SI-NEXT: v_or_b32_e32 v23, v41, v23 -; SI-NEXT: v_or_b32_e32 v24, v55, v24 -; SI-NEXT: v_or_b32_e32 v25, v53, v25 -; SI-NEXT: v_or_b32_e32 v26, v51, v26 -; SI-NEXT: v_or_b32_e32 v27, v49, v27 -; SI-NEXT: v_or_b32_e32 v28, v38, v28 -; SI-NEXT: v_or_b32_e32 v29, v36, v29 -; SI-NEXT: v_or_b32_e32 v30, v34, v30 -; SI-NEXT: v_or_b32_e32 v31, v33, v31 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v33 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v48 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v39 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v37 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr58 @@ -32728,429 +32026,474 @@ define <32 x i32> @bitcast_v64f16_to_v32i32(<64 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v53 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v62 -; SI-NEXT: v_or_b32_e32 v17, v61, v17 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: .LBB22_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB22_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v43 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v41 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v54 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v53 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v51 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v39 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v38 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v36 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v58 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v57 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v49 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v46 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v45 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v44 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v43 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v42 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v39 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v54 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v53 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v52 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v38 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v37 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v51 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v62 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v60 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v56 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v46 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v45 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v42 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v40 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v55 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v52 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v50 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v49 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v37 -; SI-NEXT: v_or_b32_e32 v28, v30, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v35 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v34 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_or_b32_e32 v30, v31, v30 ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 ; SI-NEXT: v_or_b32_e32 v31, v33, v31 @@ -33413,8 +32756,6 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; SI-LABEL: bitcast_v64f16_to_v32i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -33431,696 +32772,396 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v47, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v2 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s20 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_mov_b32_e32 v32, v17 +; SI-NEXT: v_mov_b32_e32 v33, v16 +; SI-NEXT: v_mov_b32_e32 v34, v15 +; SI-NEXT: v_mov_b32_e32 v35, v14 +; SI-NEXT: v_mov_b32_e32 v36, v13 +; SI-NEXT: v_mov_b32_e32 v37, v12 +; SI-NEXT: v_mov_b32_e32 v38, v11 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_mov_b32_e32 v48, v9 +; SI-NEXT: v_mov_b32_e32 v49, v8 +; SI-NEXT: v_mov_b32_e32 v50, v7 +; SI-NEXT: v_mov_b32_e32 v51, v6 +; SI-NEXT: v_mov_b32_e32 v52, v5 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v3 +; SI-NEXT: v_mov_b32_e32 v55, v2 +; SI-NEXT: v_mov_b32_e32 v40, v1 +; SI-NEXT: v_mov_b32_e32 v41, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v1 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v40 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v62, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s25 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v49, v26 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v63, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v57 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v33 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v46, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v44, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v42, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v56, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v30, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v57, v0 -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB23_2 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_cbranch_scc0 .LBB23_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v7, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 -; SI-NEXT: v_or_b32_e32 v5, v11, v5 -; SI-NEXT: v_or_b32_e32 v6, v13, v6 -; SI-NEXT: v_or_b32_e32 v7, v15, v7 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_or_b32_e32 v10, v62, v10 -; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: v_or_b32_e32 v11, v35, v11 -; SI-NEXT: v_mov_b32_e32 v35, v59 -; SI-NEXT: v_or_b32_e32 v12, v59, v12 -; SI-NEXT: v_mov_b32_e32 v33, v58 -; SI-NEXT: v_or_b32_e32 v13, v58, v13 -; SI-NEXT: v_mov_b32_e32 v32, v56 -; SI-NEXT: v_or_b32_e32 v14, v47, v14 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_or_b32_e32 v15, v45, v15 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_or_b32_e32 v18, v40, v18 -; SI-NEXT: v_or_b32_e32 v19, v55, v19 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_or_b32_e32 v20, v54, v20 -; SI-NEXT: v_or_b32_e32 v21, v53, v21 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_or_b32_e32 v22, v52, v22 -; SI-NEXT: v_or_b32_e32 v23, v51, v23 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_or_b32_e32 v24, v50, v24 -; SI-NEXT: v_or_b32_e32 v25, v49, v25 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_or_b32_e32 v26, v48, v26 -; SI-NEXT: v_or_b32_e32 v27, v39, v27 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_or_b32_e32 v28, v38, v28 -; SI-NEXT: v_or_b32_e32 v29, v63, v29 -; SI-NEXT: v_mov_b32_e32 v63, v60 -; SI-NEXT: v_or_b32_e32 v30, v60, v30 -; SI-NEXT: v_or_b32_e32 v31, v57, v31 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 -; SI-NEXT: v_or_b32_e32 v4, v9, v4 -; SI-NEXT: v_or_b32_e32 v8, v17, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v42 -; SI-NEXT: v_or_b32_e32 v9, v36, v9 -; SI-NEXT: v_mov_b32_e32 v36, v34 -; SI-NEXT: v_or_b32_e32 v16, v43, v16 -; SI-NEXT: v_mov_b32_e32 v43, v42 -; SI-NEXT: v_or_b32_e32 v17, v41, v17 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: s_branch .LBB23_3 -; SI-NEXT: .LBB23_2: -; SI-NEXT: v_mov_b32_e32 v36, v34 -; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: v_mov_b32_e32 v35, v59 -; SI-NEXT: v_mov_b32_e32 v33, v58 -; SI-NEXT: v_mov_b32_e32 v32, v56 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v43, v42 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_mov_b32_e32 v63, v60 -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: .LBB23_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v60, v39 -; SI-NEXT: v_mov_b32_e32 v38, v49 -; SI-NEXT: v_mov_b32_e32 v39, v51 -; SI-NEXT: v_mov_b32_e32 v48, v53 -; SI-NEXT: v_mov_b32_e32 v49, v55 -; SI-NEXT: v_mov_b32_e32 v50, v41 -; SI-NEXT: v_mov_b32_e32 v51, v43 -; SI-NEXT: v_mov_b32_e32 v52, v45 -; SI-NEXT: v_mov_b32_e32 v53, v47 -; SI-NEXT: v_mov_b32_e32 v54, v32 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v35 -; SI-NEXT: v_mov_b32_e32 v34, v36 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: s_cbranch_vccnz .LBB23_5 -; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v32 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; SI-NEXT: v_mov_b32_e32 v47, v16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v17 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: v_mov_b32_e32 v46, v17 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: v_mov_b32_e32 v45, v18 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: v_mov_b32_e32 v44, v19 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v20 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_mov_b32_e32 v43, v20 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_mov_b32_e32 v42, v21 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v13 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v11 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v8 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v28, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v29, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v40 +; SI-NEXT: v_mov_b32_e32 v56, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v30, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v13 +; SI-NEXT: v_mov_b32_e32 v62, v12 +; SI-NEXT: v_mov_b32_e32 v61, v11 +; SI-NEXT: v_mov_b32_e32 v60, v10 +; SI-NEXT: v_mov_b32_e32 v59, v9 +; SI-NEXT: v_mov_b32_e32 v58, v8 +; SI-NEXT: v_or_b32_e32 v31, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 +; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v43 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v38 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v60 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v2, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v41 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v40 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v55 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v54 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v53 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v44 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v52 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v51 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v63 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v49 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v48 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v60 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v38 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v37 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v34 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v46 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v45 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v43 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v42 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v50 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v23, v62 ; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v61 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 @@ -34133,33 +33174,25 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v26, v59 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v58 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v36 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v28, v30, v28 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 @@ -34167,14 +33200,21 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v63 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; SI-NEXT: v_or_b32_e32 v31, v33, v31 -; SI-NEXT: .LBB23_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v33 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: .LBB23_3: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -34191,8 +33231,27 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_4: +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v1 +; SI-NEXT: v_mov_b32_e32 v56, v3 +; SI-NEXT: v_mov_b32_e32 v47, v16 +; SI-NEXT: v_mov_b32_e32 v46, v17 +; SI-NEXT: v_mov_b32_e32 v45, v18 +; SI-NEXT: v_mov_b32_e32 v44, v19 +; SI-NEXT: v_mov_b32_e32 v43, v20 +; SI-NEXT: v_mov_b32_e32 v42, v21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v13 +; SI-NEXT: v_mov_b32_e32 v62, v12 +; SI-NEXT: v_mov_b32_e32 v61, v11 +; SI-NEXT: v_mov_b32_e32 v60, v10 +; SI-NEXT: v_mov_b32_e32 v59, v9 +; SI-NEXT: v_mov_b32_e32 v58, v8 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB23_2 ; ; VI-LABEL: bitcast_v64f16_to_v32i32_scalar: ; VI: ; %bb.0: @@ -67607,6 +66666,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v32f32_to_v64f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -67623,650 +66684,219 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v30 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v27 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v56, v2 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v32, v31, v30, 16 +; SI-NEXT: v_alignbit_b32 v33, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v34, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v35, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v36, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v37, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v38, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v39, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v48, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v49, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v50, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v53, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v55, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v42, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v44, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v47, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v29 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v58, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 ; SI-NEXT: .LBB44_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_f32_e32 v44, 1.0, v62 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v44 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v46 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 -; SI-NEXT: v_mov_b32_e32 v32, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v31 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v19 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v63 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v18 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 -; SI-NEXT: v_add_f32_e32 v33, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v59 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 ; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 ; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 -; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 ; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 ; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v32, v31, v30, 16 +; SI-NEXT: v_alignbit_b32 v33, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v34, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v35, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v36, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v37, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v38, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v39, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v48, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v49, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v50, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v53, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v55, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v42, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v44, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v47, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v1 -; SI-NEXT: v_mov_b32_e32 v38, v28 -; SI-NEXT: v_mov_b32_e32 v36, v30 -; SI-NEXT: v_mov_b32_e32 v46, v27 -; SI-NEXT: v_mov_b32_e32 v31, v29 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 ; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; SI-NEXT: v_or_b32_e32 v0, v0, v47 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v63 +; SI-NEXT: v_or_b32_e32 v2, v2, v44 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v62 +; SI-NEXT: v_or_b32_e32 v4, v4, v42 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v61 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v47 +; SI-NEXT: v_or_b32_e32 v3, v3, v44 +; SI-NEXT: v_or_b32_e32 v5, v5, v42 +; SI-NEXT: v_or_b32_e32 v6, v6, v55 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v60 +; SI-NEXT: v_or_b32_e32 v8, v8, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v59 +; SI-NEXT: v_or_b32_e32 v10, v10, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v58 +; SI-NEXT: v_or_b32_e32 v12, v12, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v57 +; SI-NEXT: v_or_b32_e32 v14, v14, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v56 +; SI-NEXT: v_or_b32_e32 v16, v16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v46 +; SI-NEXT: v_or_b32_e32 v18, v18, v38 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v45 +; SI-NEXT: v_or_b32_e32 v20, v20, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v43 +; SI-NEXT: v_or_b32_e32 v22, v22, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v41 +; SI-NEXT: v_or_b32_e32 v24, v24, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v40 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v56 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v55 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v35 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v63 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v48 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v40 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v6, v41 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v53 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v10, v49 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v37 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v34 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v50 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v54 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v38 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v42 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload @@ -68283,12 +66913,44 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v36 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; SI-NEXT: v_or_b32_e32 v31, v33, v31 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v26, v26, v34 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v54 +; SI-NEXT: v_or_b32_e32 v28, v28, v33 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v52 +; SI-NEXT: v_or_b32_e32 v30, v30, v32 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v51 +; SI-NEXT: v_or_b32_e32 v7, v7, v55 +; SI-NEXT: v_or_b32_e32 v9, v9, v53 +; SI-NEXT: v_or_b32_e32 v11, v11, v50 +; SI-NEXT: v_or_b32_e32 v13, v13, v49 +; SI-NEXT: v_or_b32_e32 v15, v15, v48 +; SI-NEXT: v_or_b32_e32 v17, v17, v39 +; SI-NEXT: v_or_b32_e32 v19, v19, v38 +; SI-NEXT: v_or_b32_e32 v21, v21, v37 +; SI-NEXT: v_or_b32_e32 v23, v23, v36 +; SI-NEXT: v_or_b32_e32 v25, v25, v35 +; SI-NEXT: v_or_b32_e32 v27, v27, v34 +; SI-NEXT: v_or_b32_e32 v29, v29, v33 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -68449,6 +67111,21 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v30, s16 +; SI-NEXT: v_mov_b32_e32 v31, s17 +; SI-NEXT: v_mov_b32_e32 v28, s18 +; SI-NEXT: v_mov_b32_e32 v29, s19 +; SI-NEXT: v_mov_b32_e32 v26, s20 +; SI-NEXT: v_mov_b32_e32 v27, s21 +; SI-NEXT: v_mov_b32_e32 v24, s22 +; SI-NEXT: v_mov_b32_e32 v25, s23 +; SI-NEXT: v_mov_b32_e32 v22, s24 +; SI-NEXT: v_mov_b32_e32 v23, s25 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v21, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v18, s28 +; SI-NEXT: v_mov_b32_e32 v19, s29 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -68465,542 +67142,259 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v48, s16 -; SI-NEXT: v_mov_b32_e32 v40, s17 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v41, s18 -; SI-NEXT: v_mov_b32_e32 v55, s19 -; SI-NEXT: v_mov_b32_e32 v54, s20 -; SI-NEXT: v_mov_b32_e32 v53, s21 -; SI-NEXT: v_mov_b32_e32 v52, s22 -; SI-NEXT: v_mov_b32_e32 v51, s23 -; SI-NEXT: v_mov_b32_e32 v50, s24 -; SI-NEXT: v_mov_b32_e32 v49, s25 -; SI-NEXT: v_mov_b32_e32 v39, s26 -; SI-NEXT: v_mov_b32_e32 v19, s27 -; SI-NEXT: v_mov_b32_e32 v18, s28 -; SI-NEXT: v_mov_b32_e32 v37, s29 ; SI-NEXT: s_cbranch_scc0 .LBB45_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v17 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v60, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v62, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v39 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v51 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v55 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v48 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v17 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v16 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v15 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v13 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v12 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v9 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v8 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v7 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v6 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v5 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v4 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v3 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v2 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[32:33], v[16:17], 16 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v0 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[32:33], v[14:15], 16 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v37 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[32:33], v[12:13], 16 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v18 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[32:33], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[56:57], v[4:5], 16 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[57:58], v[2:3], 16 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; SI-NEXT: v_lshr_b64 v[58:59], v[0:1], 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v54 +; SI-NEXT: v_lshr_b64 v[32:33], v[20:21], 16 +; SI-NEXT: v_mov_b32_e32 v59, v34 +; SI-NEXT: v_lshr_b64 v[33:34], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[24:25], 16 +; SI-NEXT: v_lshr_b64 v[46:47], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[26:27], 16 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v31 +; SI-NEXT: v_lshr_b64 v[47:48], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[30:31], 16 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_add_f32_e32 v21, 1.0, v41 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_lshr_b64 v[32:33], v[16:17], 16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v10 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[32:33], v[14:15], 16 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v9 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v8 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[32:33], v[12:13], 16 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v61 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[32:33], v[10:11], 16 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v59 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; SI-NEXT: v_lshr_b64 v[32:33], v[20:21], 16 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v56 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshr_b64 v[33:34], v[22:23], 16 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v27, 1.0, v40 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v3 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v46 -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_add_f32_e32 v29, 1.0, v55 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v43 -; SI-NEXT: v_add_f32_e32 v31, 1.0, v53 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v34, 1.0, v37 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v54 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v48 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v52 -; SI-NEXT: v_add_f32_e32 v25, 1.0, v51 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v50 -; SI-NEXT: v_add_f32_e32 v28, 1.0, v49 -; SI-NEXT: v_add_f32_e32 v30, 1.0, v39 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_lshr_b64 v[56:57], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[24:25], 16 +; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 ; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v17 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v3 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_lshr_b64 v[46:47], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[57:58], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[26:27], 16 +; SI-NEXT: v_lshr_b64 v[47:48], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[58:59], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[30:31], 16 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v31 +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: .LBB45_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v36, v30, v36 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v52 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v29 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v25 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v46 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v62 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v31 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v45 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v60 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v61 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v42 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v56 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v57 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v43 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_or_b32_e32 v37, v30, v31 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v38 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v38, v28, v30 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v50 +; SI-NEXT: v_or_b32_e32 v39, v28, v29 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v35 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v48, v26, v28 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v49 +; SI-NEXT: v_or_b32_e32 v49, v26, v27 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v34 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v50, v24, v26 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v51 +; SI-NEXT: v_or_b32_e32 v51, v24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v33 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v52, v22, v24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v53 +; SI-NEXT: v_or_b32_e32 v53, v22, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v32 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v54, v20, v22 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v55 +; SI-NEXT: v_or_b32_e32 v55, v20, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v40 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v40, v18, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v45 +; SI-NEXT: v_or_b32_e32 v41, v18, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v58 +; SI-NEXT: v_or_b32_e32 v32, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v61 +; SI-NEXT: v_or_b32_e32 v33, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 +; SI-NEXT: v_or_b32_e32 v34, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_or_b32_e32 v35, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v56 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v60 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_mov_b32_e32 v3, v39 +; SI-NEXT: v_mov_b32_e32 v4, v48 +; SI-NEXT: v_mov_b32_e32 v5, v49 +; SI-NEXT: v_mov_b32_e32 v6, v50 +; SI-NEXT: v_mov_b32_e32 v7, v51 +; SI-NEXT: v_mov_b32_e32 v8, v52 +; SI-NEXT: v_mov_b32_e32 v9, v53 +; SI-NEXT: v_mov_b32_e32 v10, v54 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: v_mov_b32_e32 v12, v40 +; SI-NEXT: v_mov_b32_e32 v11, v55 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: v_mov_b32_e32 v13, v41 +; SI-NEXT: v_mov_b32_e32 v14, v32 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v44 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v28, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v15 +; SI-NEXT: v_mov_b32_e32 v15, v33 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v28, v47 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v29, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v58 +; SI-NEXT: v_mov_b32_e32 v2, v38 +; SI-NEXT: v_mov_b32_e32 v16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v30, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -69017,112 +67411,55 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v31, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v36 +; SI-NEXT: v_mov_b32_e32 v1, v37 +; SI-NEXT: v_mov_b32_e32 v17, v35 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; SI-NEXT: v_or_b32_e32 v31, v33, v31 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; kill: killed $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; kill: killed $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; kill: killed $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_branch .LBB45_2 ; ; VI-LABEL: bitcast_v32f32_to_v64f16_scalar: @@ -69369,342 +67706,208 @@ define <32 x float> @bitcast_v64f16_to_v32f32(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v31 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v57 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v61, v1 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v62, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v60, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v30 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v29 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v27 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v25 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v23 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v21 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_mov_b32_e32 v52, v18 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_mov_b32_e32 v53, v17 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_mov_b32_e32 v54, v16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_mov_b32_e32 v55, v15 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_mov_b32_e32 v40, v14 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_mov_b32_e32 v41, v13 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_mov_b32_e32 v42, v12 +; SI-NEXT: v_mov_b32_e32 v43, v11 +; SI-NEXT: v_mov_b32_e32 v44, v10 +; SI-NEXT: v_mov_b32_e32 v45, v9 +; SI-NEXT: v_mov_b32_e32 v46, v8 +; SI-NEXT: v_mov_b32_e32 v47, v7 +; SI-NEXT: v_mov_b32_e32 v56, v6 +; SI-NEXT: v_mov_b32_e32 v57, v5 +; SI-NEXT: v_mov_b32_e32 v58, v4 +; SI-NEXT: v_mov_b32_e32 v59, v3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v28 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v33 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v62 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v35 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: v_or_b32_e32 v18, v59, v18 -; SI-NEXT: v_or_b32_e32 v19, v57, v19 -; SI-NEXT: v_or_b32_e32 v20, v47, v20 -; SI-NEXT: v_or_b32_e32 v21, v45, v21 -; SI-NEXT: v_or_b32_e32 v22, v43, v22 -; SI-NEXT: v_or_b32_e32 v23, v41, v23 -; SI-NEXT: v_or_b32_e32 v24, v55, v24 -; SI-NEXT: v_or_b32_e32 v25, v53, v25 -; SI-NEXT: v_or_b32_e32 v26, v51, v26 -; SI-NEXT: v_or_b32_e32 v27, v49, v27 -; SI-NEXT: v_or_b32_e32 v28, v38, v28 -; SI-NEXT: v_or_b32_e32 v29, v36, v29 -; SI-NEXT: v_or_b32_e32 v30, v34, v30 -; SI-NEXT: v_or_b32_e32 v31, v33, v31 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v33 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v48 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v39 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v37 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr58 @@ -69716,429 +67919,474 @@ define <32 x float> @bitcast_v64f16_to_v32f32(<64 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v53 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v62 -; SI-NEXT: v_or_b32_e32 v17, v61, v17 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: .LBB46_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v43 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v41 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v54 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v53 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v51 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v39 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v38 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v36 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v58 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v57 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v49 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v46 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v45 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v44 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v43 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v42 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v39 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v54 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v53 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v52 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v38 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v37 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v51 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v62 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v60 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v56 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v46 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v45 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v42 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v40 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v55 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v52 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v50 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v49 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v37 -; SI-NEXT: v_or_b32_e32 v28, v30, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v35 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v34 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_or_b32_e32 v30, v31, v30 ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 ; SI-NEXT: v_or_b32_e32 v31, v33, v31 @@ -70401,8 +68649,6 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; SI-LABEL: bitcast_v64f16_to_v32f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -70419,696 +68665,396 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v47, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v2 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s20 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v1 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v40 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v62, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s25 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v49, v26 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v63, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v57 +; SI-NEXT: v_mov_b32_e32 v32, v17 +; SI-NEXT: v_mov_b32_e32 v33, v16 +; SI-NEXT: v_mov_b32_e32 v34, v15 +; SI-NEXT: v_mov_b32_e32 v35, v14 +; SI-NEXT: v_mov_b32_e32 v36, v13 +; SI-NEXT: v_mov_b32_e32 v37, v12 +; SI-NEXT: v_mov_b32_e32 v38, v11 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_mov_b32_e32 v48, v9 +; SI-NEXT: v_mov_b32_e32 v49, v8 +; SI-NEXT: v_mov_b32_e32 v50, v7 +; SI-NEXT: v_mov_b32_e32 v51, v6 +; SI-NEXT: v_mov_b32_e32 v52, v5 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v3 +; SI-NEXT: v_mov_b32_e32 v55, v2 +; SI-NEXT: v_mov_b32_e32 v40, v1 +; SI-NEXT: v_mov_b32_e32 v41, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v33 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v46, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v44, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v42, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v56, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v30, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v57, v0 -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB47_2 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v7, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 -; SI-NEXT: v_or_b32_e32 v5, v11, v5 -; SI-NEXT: v_or_b32_e32 v6, v13, v6 -; SI-NEXT: v_or_b32_e32 v7, v15, v7 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_or_b32_e32 v10, v62, v10 -; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: v_or_b32_e32 v11, v35, v11 -; SI-NEXT: v_mov_b32_e32 v35, v59 -; SI-NEXT: v_or_b32_e32 v12, v59, v12 -; SI-NEXT: v_mov_b32_e32 v33, v58 -; SI-NEXT: v_or_b32_e32 v13, v58, v13 -; SI-NEXT: v_mov_b32_e32 v32, v56 -; SI-NEXT: v_or_b32_e32 v14, v47, v14 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_or_b32_e32 v15, v45, v15 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_or_b32_e32 v18, v40, v18 -; SI-NEXT: v_or_b32_e32 v19, v55, v19 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_or_b32_e32 v20, v54, v20 -; SI-NEXT: v_or_b32_e32 v21, v53, v21 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_or_b32_e32 v22, v52, v22 -; SI-NEXT: v_or_b32_e32 v23, v51, v23 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_or_b32_e32 v24, v50, v24 -; SI-NEXT: v_or_b32_e32 v25, v49, v25 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_or_b32_e32 v26, v48, v26 -; SI-NEXT: v_or_b32_e32 v27, v39, v27 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_or_b32_e32 v28, v38, v28 -; SI-NEXT: v_or_b32_e32 v29, v63, v29 -; SI-NEXT: v_mov_b32_e32 v63, v60 -; SI-NEXT: v_or_b32_e32 v30, v60, v30 -; SI-NEXT: v_or_b32_e32 v31, v57, v31 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 -; SI-NEXT: v_or_b32_e32 v4, v9, v4 -; SI-NEXT: v_or_b32_e32 v8, v17, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v42 -; SI-NEXT: v_or_b32_e32 v9, v36, v9 -; SI-NEXT: v_mov_b32_e32 v36, v34 -; SI-NEXT: v_or_b32_e32 v16, v43, v16 -; SI-NEXT: v_mov_b32_e32 v43, v42 -; SI-NEXT: v_or_b32_e32 v17, v41, v17 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: s_branch .LBB47_3 -; SI-NEXT: .LBB47_2: -; SI-NEXT: v_mov_b32_e32 v36, v34 -; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: v_mov_b32_e32 v35, v59 -; SI-NEXT: v_mov_b32_e32 v33, v58 -; SI-NEXT: v_mov_b32_e32 v32, v56 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v43, v42 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_mov_b32_e32 v63, v60 -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: .LBB47_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v60, v39 -; SI-NEXT: v_mov_b32_e32 v38, v49 -; SI-NEXT: v_mov_b32_e32 v39, v51 -; SI-NEXT: v_mov_b32_e32 v48, v53 -; SI-NEXT: v_mov_b32_e32 v49, v55 -; SI-NEXT: v_mov_b32_e32 v50, v41 -; SI-NEXT: v_mov_b32_e32 v51, v43 -; SI-NEXT: v_mov_b32_e32 v52, v45 -; SI-NEXT: v_mov_b32_e32 v53, v47 -; SI-NEXT: v_mov_b32_e32 v54, v32 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v35 -; SI-NEXT: v_mov_b32_e32 v34, v36 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: s_cbranch_vccnz .LBB47_5 -; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v32 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; SI-NEXT: v_mov_b32_e32 v47, v16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v17 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: v_mov_b32_e32 v46, v17 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: v_mov_b32_e32 v45, v18 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: v_mov_b32_e32 v44, v19 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v20 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_mov_b32_e32 v43, v20 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_mov_b32_e32 v42, v21 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v13 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v11 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v8 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v28, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v29, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v40 +; SI-NEXT: v_mov_b32_e32 v56, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v30, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v13 +; SI-NEXT: v_mov_b32_e32 v62, v12 +; SI-NEXT: v_mov_b32_e32 v61, v11 +; SI-NEXT: v_mov_b32_e32 v60, v10 +; SI-NEXT: v_mov_b32_e32 v59, v9 +; SI-NEXT: v_mov_b32_e32 v58, v8 +; SI-NEXT: v_or_b32_e32 v31, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 +; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: .LBB47_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v43 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v38 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v60 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v2, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v41 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v40 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v55 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v54 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v53 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v44 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v52 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v51 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v63 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v49 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v48 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v60 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v38 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v37 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v34 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v46 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v45 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v43 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v42 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v50 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v23, v62 ; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v61 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 @@ -71121,33 +69067,25 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v26, v59 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v58 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v36 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v28, v30, v28 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 @@ -71155,14 +69093,21 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v63 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; SI-NEXT: v_or_b32_e32 v31, v33, v31 -; SI-NEXT: .LBB47_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v33 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: .LBB47_3: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -71179,8 +69124,27 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB47_4: +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v1 +; SI-NEXT: v_mov_b32_e32 v56, v3 +; SI-NEXT: v_mov_b32_e32 v47, v16 +; SI-NEXT: v_mov_b32_e32 v46, v17 +; SI-NEXT: v_mov_b32_e32 v45, v18 +; SI-NEXT: v_mov_b32_e32 v44, v19 +; SI-NEXT: v_mov_b32_e32 v43, v20 +; SI-NEXT: v_mov_b32_e32 v42, v21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v13 +; SI-NEXT: v_mov_b32_e32 v62, v12 +; SI-NEXT: v_mov_b32_e32 v61, v11 +; SI-NEXT: v_mov_b32_e32 v60, v10 +; SI-NEXT: v_mov_b32_e32 v59, v9 +; SI-NEXT: v_mov_b32_e32 v58, v8 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB47_2 ; ; VI-LABEL: bitcast_v64f16_to_v32f32_scalar: ; VI: ; %bb.0: @@ -102726,6 +100690,8 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v16i64_to_v64f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -102742,290 +100708,91 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB64_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v30 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v27 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v42, v3 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v56, v2 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v32, v31, v30, 16 +; SI-NEXT: v_alignbit_b32 v33, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v34, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v35, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v36, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v37, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v38, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v39, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v48, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v49, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v50, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v52, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v54, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v41, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v44, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v47, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v29 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v58, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 ; SI-NEXT: .LBB64_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB64_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v0 -; SI-NEXT: v_addc_u32_e32 v35, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 @@ -103044,349 +100811,116 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v7 ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v38 ; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 ; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 ; SI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_addc_u32_e32 v42, vcc, 0, v62, vcc -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v30 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v62 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v46 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v19 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 -; SI-NEXT: v_mov_b32_e32 v32, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v63 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc +; SI-NEXT: v_alignbit_b32 v32, v31, v30, 16 +; SI-NEXT: v_alignbit_b32 v33, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v34, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v35, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v36, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v37, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v38, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v39, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v48, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v49, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v50, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v52, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v54, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v41, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v44, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v47, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v5 -; SI-NEXT: v_mov_b32_e32 v38, v28 -; SI-NEXT: v_mov_b32_e32 v36, v30 -; SI-NEXT: v_mov_b32_e32 v46, v27 -; SI-NEXT: v_mov_b32_e32 v31, v29 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 ; SI-NEXT: .LBB64_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v56 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; SI-NEXT: v_or_b32_e32 v0, v0, v47 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v63 +; SI-NEXT: v_or_b32_e32 v2, v2, v44 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v62 +; SI-NEXT: v_or_b32_e32 v4, v4, v41 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v61 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v47 +; SI-NEXT: v_or_b32_e32 v3, v3, v44 +; SI-NEXT: v_or_b32_e32 v5, v5, v41 +; SI-NEXT: v_or_b32_e32 v6, v6, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v60 +; SI-NEXT: v_or_b32_e32 v8, v8, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v59 +; SI-NEXT: v_or_b32_e32 v10, v10, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v58 +; SI-NEXT: v_or_b32_e32 v12, v12, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v57 +; SI-NEXT: v_or_b32_e32 v14, v14, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v56 +; SI-NEXT: v_or_b32_e32 v16, v16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v46 +; SI-NEXT: v_or_b32_e32 v18, v18, v38 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v45 +; SI-NEXT: v_or_b32_e32 v20, v20, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v43 +; SI-NEXT: v_or_b32_e32 v22, v22, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v42 +; SI-NEXT: v_or_b32_e32 v24, v24, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v40 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v55 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v35 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v63 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v48 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v40 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v6, v41 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v53 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v10, v49 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v37 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v34 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v50 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v54 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v38 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v44 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload @@ -103403,12 +100937,44 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v36 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; SI-NEXT: v_or_b32_e32 v31, v33, v31 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v26, v26, v34 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v55 +; SI-NEXT: v_or_b32_e32 v28, v28, v33 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v53 +; SI-NEXT: v_or_b32_e32 v30, v30, v32 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v51 +; SI-NEXT: v_or_b32_e32 v7, v7, v54 +; SI-NEXT: v_or_b32_e32 v9, v9, v52 +; SI-NEXT: v_or_b32_e32 v11, v11, v50 +; SI-NEXT: v_or_b32_e32 v13, v13, v49 +; SI-NEXT: v_or_b32_e32 v15, v15, v48 +; SI-NEXT: v_or_b32_e32 v17, v17, v39 +; SI-NEXT: v_or_b32_e32 v19, v19, v38 +; SI-NEXT: v_or_b32_e32 v21, v21, v37 +; SI-NEXT: v_or_b32_e32 v23, v23, v36 +; SI-NEXT: v_or_b32_e32 v25, v25, v35 +; SI-NEXT: v_or_b32_e32 v27, v27, v34 +; SI-NEXT: v_or_b32_e32 v29, v29, v33 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -103592,540 +101158,369 @@ define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i ; SI-LABEL: bitcast_v16i64_to_v64f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v32, s30, 0 +; SI-NEXT: v_writelane_b32 v32, s31, 1 +; SI-NEXT: v_writelane_b32 v32, s34, 2 +; SI-NEXT: v_writelane_b32 v32, s35, 3 +; SI-NEXT: v_writelane_b32 v32, s36, 4 +; SI-NEXT: v_writelane_b32 v32, s37, 5 +; SI-NEXT: v_writelane_b32 v32, s38, 6 ; SI-NEXT: v_mov_b32_e32 v19, s16 -; SI-NEXT: v_readfirstlane_b32 s40, v19 +; SI-NEXT: v_writelane_b32 v32, s39, 7 +; SI-NEXT: v_readfirstlane_b32 s56, v19 ; SI-NEXT: v_mov_b32_e32 v19, s17 -; SI-NEXT: v_readfirstlane_b32 s44, v19 +; SI-NEXT: v_writelane_b32 v32, s48, 8 +; SI-NEXT: v_readfirstlane_b32 s57, v19 ; SI-NEXT: v_mov_b32_e32 v19, s18 -; SI-NEXT: v_readfirstlane_b32 s41, v19 +; SI-NEXT: v_writelane_b32 v32, s49, 9 +; SI-NEXT: v_readfirstlane_b32 s46, v19 ; SI-NEXT: v_mov_b32_e32 v19, s19 -; SI-NEXT: v_readfirstlane_b32 s45, v19 +; SI-NEXT: v_writelane_b32 v32, s50, 10 +; SI-NEXT: v_readfirstlane_b32 s47, v19 ; SI-NEXT: v_mov_b32_e32 v19, s20 -; SI-NEXT: v_readfirstlane_b32 s42, v19 +; SI-NEXT: v_writelane_b32 v32, s51, 11 +; SI-NEXT: v_readfirstlane_b32 s44, v19 ; SI-NEXT: v_mov_b32_e32 v19, s21 -; SI-NEXT: v_readfirstlane_b32 s46, v19 +; SI-NEXT: v_writelane_b32 v32, s52, 12 +; SI-NEXT: v_readfirstlane_b32 s45, v19 ; SI-NEXT: v_mov_b32_e32 v19, s22 -; SI-NEXT: v_readfirstlane_b32 s43, v19 +; SI-NEXT: v_writelane_b32 v32, s53, 13 +; SI-NEXT: v_readfirstlane_b32 s42, v19 ; SI-NEXT: v_mov_b32_e32 v19, s23 -; SI-NEXT: v_readfirstlane_b32 s47, v19 +; SI-NEXT: v_writelane_b32 v32, s54, 14 +; SI-NEXT: v_readfirstlane_b32 s43, v19 ; SI-NEXT: v_mov_b32_e32 v19, s24 -; SI-NEXT: v_readfirstlane_b32 s24, v19 +; SI-NEXT: v_writelane_b32 v32, s55, 15 +; SI-NEXT: v_readfirstlane_b32 s40, v19 ; SI-NEXT: v_mov_b32_e32 v19, s25 -; SI-NEXT: v_readfirstlane_b32 s56, v19 +; SI-NEXT: v_writelane_b32 v32, s64, 16 +; SI-NEXT: v_readfirstlane_b32 s41, v19 ; SI-NEXT: v_mov_b32_e32 v19, s26 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v63, s30, 0 -; SI-NEXT: v_readfirstlane_b32 s25, v19 +; SI-NEXT: v_writelane_b32 v32, s65, 17 +; SI-NEXT: v_readfirstlane_b32 s24, v19 ; SI-NEXT: v_mov_b32_e32 v19, s27 -; SI-NEXT: v_writelane_b32 v63, s31, 1 -; SI-NEXT: v_readfirstlane_b32 s27, v19 +; SI-NEXT: v_writelane_b32 v32, s66, 18 +; SI-NEXT: v_readfirstlane_b32 s25, v19 ; SI-NEXT: v_mov_b32_e32 v19, s28 -; SI-NEXT: v_writelane_b32 v63, s34, 2 -; SI-NEXT: v_readfirstlane_b32 s26, v19 +; SI-NEXT: v_writelane_b32 v32, s67, 19 +; SI-NEXT: v_readfirstlane_b32 s22, v19 ; SI-NEXT: v_mov_b32_e32 v19, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: v_writelane_b32 v63, s35, 3 -; SI-NEXT: v_readfirstlane_b32 s28, v19 -; SI-NEXT: v_readfirstlane_b32 s22, v0 -; SI-NEXT: v_readfirstlane_b32 s23, v1 -; SI-NEXT: v_readfirstlane_b32 s20, v2 -; SI-NEXT: v_readfirstlane_b32 s21, v3 -; SI-NEXT: v_readfirstlane_b32 s18, v4 -; SI-NEXT: v_readfirstlane_b32 s19, v5 -; SI-NEXT: v_readfirstlane_b32 s16, v6 -; SI-NEXT: v_readfirstlane_b32 s17, v7 -; SI-NEXT: v_readfirstlane_b32 s14, v8 -; SI-NEXT: v_readfirstlane_b32 s15, v9 -; SI-NEXT: v_readfirstlane_b32 s12, v10 -; SI-NEXT: v_readfirstlane_b32 s13, v11 -; SI-NEXT: v_readfirstlane_b32 s10, v12 -; SI-NEXT: v_readfirstlane_b32 s11, v13 -; SI-NEXT: v_readfirstlane_b32 s7, v14 -; SI-NEXT: v_readfirstlane_b32 s8, v15 -; SI-NEXT: v_readfirstlane_b32 s6, v16 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v17 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_writelane_b32 v32, s68, 20 +; SI-NEXT: v_readfirstlane_b32 s23, v19 +; SI-NEXT: v_readfirstlane_b32 s20, v0 +; SI-NEXT: v_readfirstlane_b32 s21, v1 +; SI-NEXT: v_readfirstlane_b32 s18, v2 +; SI-NEXT: v_readfirstlane_b32 s19, v3 +; SI-NEXT: v_readfirstlane_b32 s16, v4 +; SI-NEXT: v_readfirstlane_b32 s17, v5 +; SI-NEXT: v_readfirstlane_b32 s14, v6 +; SI-NEXT: v_readfirstlane_b32 s15, v7 +; SI-NEXT: v_readfirstlane_b32 s12, v8 +; SI-NEXT: v_readfirstlane_b32 s13, v9 +; SI-NEXT: v_readfirstlane_b32 s10, v10 +; SI-NEXT: v_readfirstlane_b32 s11, v11 +; SI-NEXT: v_readfirstlane_b32 s8, v12 +; SI-NEXT: v_readfirstlane_b32 s9, v13 +; SI-NEXT: v_readfirstlane_b32 s6, v14 +; SI-NEXT: v_readfirstlane_b32 s7, v15 +; SI-NEXT: v_readfirstlane_b32 s4, v16 +; SI-NEXT: s_and_b64 s[26:27], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v17 +; SI-NEXT: v_writelane_b32 v32, s69, 21 ; SI-NEXT: s_cbranch_scc0 .LBB65_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 -; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 -; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s56, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s47, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s43, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s46, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s42, 16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 -; SI-NEXT: s_lshr_b32 s4, s45, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s41, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s44, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s40, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v62, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s40 +; SI-NEXT: s_lshr_b32 s38, s5, 16 +; SI-NEXT: s_lshr_b32 s39, s7, 16 +; SI-NEXT: s_lshr_b32 s48, s9, 16 +; SI-NEXT: s_lshr_b32 s49, s11, 16 +; SI-NEXT: s_lshr_b32 s50, s13, 16 +; SI-NEXT: s_lshr_b32 s51, s15, 16 +; SI-NEXT: s_lshr_b32 s52, s17, 16 +; SI-NEXT: s_lshr_b32 s53, s19, 16 +; SI-NEXT: s_lshr_b32 s54, s21, 16 +; SI-NEXT: s_lshr_b32 s55, s23, 16 +; SI-NEXT: s_lshr_b32 s64, s25, 16 +; SI-NEXT: s_lshr_b32 s65, s41, 16 +; SI-NEXT: s_lshr_b32 s66, s43, 16 +; SI-NEXT: s_lshr_b32 s67, s45, 16 +; SI-NEXT: s_lshr_b32 s68, s47, 16 +; SI-NEXT: s_lshr_b32 s69, s57, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[36:37], s[56:57], 16 ; SI-NEXT: s_cbranch_execnz .LBB65_3 ; SI-NEXT: .LBB65_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s40, 3 -; SI-NEXT: s_addc_u32 s5, s44, 0 -; SI-NEXT: s_lshr_b32 s29, s4, 16 -; SI-NEXT: s_lshr_b32 s40, s5, 16 -; SI-NEXT: s_add_u32 s41, s41, 3 -; SI-NEXT: s_addc_u32 s44, s45, 0 -; SI-NEXT: s_lshr_b32 s45, s41, 16 -; SI-NEXT: s_lshr_b32 s57, s44, 16 -; SI-NEXT: s_add_u32 s42, s42, 3 -; SI-NEXT: s_addc_u32 s46, s46, 0 -; SI-NEXT: s_lshr_b32 s58, s42, 16 -; SI-NEXT: s_lshr_b32 s59, s46, 16 -; SI-NEXT: s_add_u32 s43, s43, 3 -; SI-NEXT: s_addc_u32 s47, s47, 0 -; SI-NEXT: s_lshr_b32 s60, s43, 16 -; SI-NEXT: s_lshr_b32 s61, s47, 16 -; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s56, s56, 0 -; SI-NEXT: s_lshr_b32 s62, s24, 16 -; SI-NEXT: s_lshr_b32 s63, s56, 16 -; SI-NEXT: s_add_u32 s25, s25, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 -; SI-NEXT: s_lshr_b32 s72, s25, 16 -; SI-NEXT: s_lshr_b32 s73, s27, 16 -; SI-NEXT: s_add_u32 s26, s26, 3 -; SI-NEXT: s_addc_u32 s28, s28, 0 -; SI-NEXT: s_lshr_b32 s74, s26, 16 -; SI-NEXT: s_lshr_b32 s75, s28, 16 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_lshr_b32 s76, s22, 16 -; SI-NEXT: s_lshr_b32 s77, s23, 16 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_lshr_b32 s78, s20, 16 -; SI-NEXT: s_lshr_b32 s79, s21, 16 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_lshr_b32 s88, s18, 16 -; SI-NEXT: s_lshr_b32 s89, s19, 16 -; SI-NEXT: s_add_u32 s16, s16, 3 -; SI-NEXT: s_addc_u32 s17, s17, 0 -; SI-NEXT: s_lshr_b32 s90, s16, 16 -; SI-NEXT: s_lshr_b32 s91, s17, 16 -; SI-NEXT: s_add_u32 s14, s14, 3 -; SI-NEXT: s_addc_u32 s15, s15, 0 -; SI-NEXT: s_lshr_b32 s92, s14, 16 -; SI-NEXT: s_lshr_b32 s93, s15, 16 -; SI-NEXT: s_add_u32 s12, s12, 3 -; SI-NEXT: s_addc_u32 s13, s13, 0 -; SI-NEXT: s_lshr_b32 s94, s12, 16 -; SI-NEXT: s_lshr_b32 s95, s13, 16 -; SI-NEXT: s_add_u32 s10, s10, 3 -; SI-NEXT: s_addc_u32 s11, s11, 0 -; SI-NEXT: s_lshr_b32 vcc_lo, s10, 16 -; SI-NEXT: s_lshr_b32 vcc_hi, s11, 16 -; SI-NEXT: s_add_u32 s7, s7, 3 -; SI-NEXT: s_addc_u32 s8, s8, 0 -; SI-NEXT: s_lshr_b32 s30, s7, 16 -; SI-NEXT: s_lshr_b32 s31, s8, 16 +; SI-NEXT: s_add_u32 s4, s4, 3 +; SI-NEXT: s_addc_u32 s5, s5, 0 ; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s7, s7, 0 +; SI-NEXT: s_add_u32 s8, s8, 3 ; SI-NEXT: s_addc_u32 s9, s9, 0 -; SI-NEXT: s_lshr_b32 s35, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s35 -; SI-NEXT: s_lshr_b32 s34, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s24 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v56, s47 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v57, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s46 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v59, s42 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v60, s44 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v61, s41 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v62, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v31, s34 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s31 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s30 -; SI-NEXT: v_cvt_f32_f16_e32 v26, vcc_hi -; SI-NEXT: v_cvt_f32_f16_e32 v27, vcc_lo -; SI-NEXT: v_cvt_f32_f16_e32 v24, s95 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s94 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s93 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s92 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s91 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s90 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s89 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s88 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s79 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s78 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s77 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s76 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s75 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s74 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s73 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s72 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s63 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s62 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s40 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s29 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s40, s40, 3 +; SI-NEXT: s_addc_u32 s41, s41, 0 +; SI-NEXT: s_add_u32 s42, s42, 3 +; SI-NEXT: s_addc_u32 s43, s43, 0 +; SI-NEXT: s_add_u32 s44, s44, 3 +; SI-NEXT: s_addc_u32 s45, s45, 0 +; SI-NEXT: s_add_u32 s46, s46, 3 +; SI-NEXT: s_addc_u32 s47, s47, 0 +; SI-NEXT: s_add_u32 s56, s56, 3 +; SI-NEXT: s_addc_u32 s57, s57, 0 +; SI-NEXT: s_lshr_b32 s38, s5, 16 +; SI-NEXT: s_lshr_b32 s39, s7, 16 +; SI-NEXT: s_lshr_b32 s48, s9, 16 +; SI-NEXT: s_lshr_b32 s49, s11, 16 +; SI-NEXT: s_lshr_b32 s50, s13, 16 +; SI-NEXT: s_lshr_b32 s51, s15, 16 +; SI-NEXT: s_lshr_b32 s52, s17, 16 +; SI-NEXT: s_lshr_b32 s53, s19, 16 +; SI-NEXT: s_lshr_b32 s54, s21, 16 +; SI-NEXT: s_lshr_b32 s55, s23, 16 +; SI-NEXT: s_lshr_b32 s64, s25, 16 +; SI-NEXT: s_lshr_b32 s65, s41, 16 +; SI-NEXT: s_lshr_b32 s66, s43, 16 +; SI-NEXT: s_lshr_b32 s67, s45, 16 +; SI-NEXT: s_lshr_b32 s68, s47, 16 +; SI-NEXT: s_lshr_b32 s69, s57, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[36:37], s[56:57], 16 ; SI-NEXT: .LBB65_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v30, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v62 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v1, v30, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v2, v30, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v3, v30, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v58 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v59 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v59, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v59 -; SI-NEXT: v_or_b32_e32 v5, v5, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v6, v30, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v8, v30, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v10, v30, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_or_b32_e32 v12, v30, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 -; SI-NEXT: v_or_b32_e32 v14, v30, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_or_b32_e32 v16, v30, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 -; SI-NEXT: v_or_b32_e32 v18, v30, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: v_or_b32_e32 v20, v30, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 -; SI-NEXT: v_or_b32_e32 v22, v30, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v39 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v4, v58, v4 -; SI-NEXT: v_or_b32_e32 v7, v56, v7 -; SI-NEXT: v_or_b32_e32 v9, v46, v9 -; SI-NEXT: v_or_b32_e32 v11, v44, v11 -; SI-NEXT: v_or_b32_e32 v13, v42, v13 -; SI-NEXT: v_or_b32_e32 v15, v40, v15 -; SI-NEXT: v_or_b32_e32 v24, v30, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v37 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v37, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_or_b32_e32 v26, v30, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_or_b32_e32 v28, v30, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v33 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v35 -; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: v_or_b32_e32 v17, v54, v17 -; SI-NEXT: v_or_b32_e32 v19, v52, v19 -; SI-NEXT: v_or_b32_e32 v21, v50, v21 -; SI-NEXT: v_or_b32_e32 v23, v48, v23 -; SI-NEXT: v_or_b32_e32 v25, v38, v25 -; SI-NEXT: v_or_b32_e32 v27, v36, v27 -; SI-NEXT: v_or_b32_e32 v29, v34, v29 -; SI-NEXT: v_readlane_b32 s35, v63, 3 -; SI-NEXT: v_readlane_b32 s34, v63, 2 -; SI-NEXT: v_readlane_b32 s31, v63, 1 -; SI-NEXT: v_readlane_b32 s30, v63, 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v33 -; SI-NEXT: v_or_b32_e32 v31, v32, v31 -; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_lshl_b32 s27, s36, 16 +; SI-NEXT: s_and_b32 s29, s56, 0xffff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s29, s57, 0xffff +; SI-NEXT: s_lshl_b32 s56, s69, 16 +; SI-NEXT: s_or_b32 s29, s29, s56 +; SI-NEXT: s_lshl_b32 s56, s34, 16 +; SI-NEXT: s_and_b32 s46, s46, 0xffff +; SI-NEXT: s_or_b32 s46, s46, s56 +; SI-NEXT: s_and_b32 s47, s47, 0xffff +; SI-NEXT: s_lshl_b32 s56, s68, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_lshl_b32 s56, s30, 16 +; SI-NEXT: s_and_b32 s44, s44, 0xffff +; SI-NEXT: s_or_b32 s44, s44, s56 +; SI-NEXT: s_and_b32 s45, s45, 0xffff +; SI-NEXT: s_lshl_b32 s56, s67, 16 +; SI-NEXT: s_or_b32 s45, s45, s56 +; SI-NEXT: s_lshl_b32 s56, s94, 16 +; SI-NEXT: s_and_b32 s42, s42, 0xffff +; SI-NEXT: s_or_b32 s42, s42, s56 +; SI-NEXT: s_and_b32 s43, s43, 0xffff +; SI-NEXT: s_lshl_b32 s56, s66, 16 +; SI-NEXT: s_or_b32 s43, s43, s56 +; SI-NEXT: s_lshl_b32 s56, s92, 16 +; SI-NEXT: s_and_b32 s40, s40, 0xffff +; SI-NEXT: s_or_b32 s40, s40, s56 +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: s_lshl_b32 s56, s65, 16 +; SI-NEXT: s_or_b32 s41, s41, s56 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_lshl_b32 s56, s90, 16 +; SI-NEXT: s_or_b32 s24, s24, s56 +; SI-NEXT: s_and_b32 s25, s25, 0xffff +; SI-NEXT: s_lshl_b32 s56, s64, 16 +; SI-NEXT: s_or_b32 s25, s25, s56 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_lshl_b32 s56, s88, 16 +; SI-NEXT: s_or_b32 s22, s22, s56 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s56, s55, 16 +; SI-NEXT: s_or_b32 s23, s23, s56 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_lshl_b32 s56, s78, 16 +; SI-NEXT: s_or_b32 s20, s20, s56 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s54, 16 +; SI-NEXT: s_or_b32 s21, s21, s56 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s56, s76, 16 +; SI-NEXT: s_or_b32 s18, s18, s56 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s56, s53, 16 +; SI-NEXT: s_or_b32 s19, s19, s56 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s56, s74, 16 +; SI-NEXT: s_or_b32 s16, s16, s56 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s56, s52, 16 +; SI-NEXT: s_or_b32 s17, s17, s56 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s56, s72, 16 +; SI-NEXT: s_or_b32 s14, s14, s56 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s56, s51, 16 +; SI-NEXT: s_or_b32 s15, s15, s56 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s56, s62, 16 +; SI-NEXT: s_or_b32 s12, s12, s56 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s56, s50, 16 +; SI-NEXT: s_or_b32 s13, s13, s56 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s56, s58, 16 +; SI-NEXT: s_or_b32 s10, s10, s56 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s56, s49, 16 +; SI-NEXT: s_or_b32 s11, s11, s56 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s56, s60, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s28, s28, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s56 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s56, s48, 16 +; SI-NEXT: s_or_b32 s6, s6, s28 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s28, s39, 16 +; SI-NEXT: s_or_b32 s4, s4, s26 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s26, s38, 16 +; SI-NEXT: s_or_b32 s9, s9, s56 +; SI-NEXT: s_or_b32 s7, s7, s28 +; SI-NEXT: s_or_b32 s5, s5, s26 +; SI-NEXT: v_mov_b32_e32 v0, s27 +; SI-NEXT: v_mov_b32_e32 v1, s29 +; SI-NEXT: v_mov_b32_e32 v2, s46 +; SI-NEXT: v_mov_b32_e32 v3, s47 +; SI-NEXT: v_mov_b32_e32 v4, s44 +; SI-NEXT: v_mov_b32_e32 v5, s45 +; SI-NEXT: v_mov_b32_e32 v6, s42 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s40 +; SI-NEXT: v_mov_b32_e32 v9, s41 +; SI-NEXT: v_mov_b32_e32 v10, s24 +; SI-NEXT: v_mov_b32_e32 v11, s25 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v13, s23 +; SI-NEXT: v_mov_b32_e32 v14, s20 +; SI-NEXT: v_mov_b32_e32 v15, s21 +; SI-NEXT: v_mov_b32_e32 v16, s18 +; SI-NEXT: v_mov_b32_e32 v17, s19 +; SI-NEXT: v_mov_b32_e32 v18, s16 +; SI-NEXT: v_mov_b32_e32 v19, s17 +; SI-NEXT: v_mov_b32_e32 v20, s14 +; SI-NEXT: v_mov_b32_e32 v21, s15 +; SI-NEXT: v_mov_b32_e32 v22, s12 +; SI-NEXT: v_mov_b32_e32 v23, s13 +; SI-NEXT: v_mov_b32_e32 v24, s10 +; SI-NEXT: v_mov_b32_e32 v25, s11 +; SI-NEXT: v_mov_b32_e32 v26, s8 +; SI-NEXT: v_mov_b32_e32 v27, s9 +; SI-NEXT: v_mov_b32_e32 v28, s6 +; SI-NEXT: v_mov_b32_e32 v29, s7 +; SI-NEXT: v_mov_b32_e32 v30, s4 +; SI-NEXT: v_mov_b32_e32 v31, s5 +; SI-NEXT: v_readlane_b32 s69, v32, 21 +; SI-NEXT: v_readlane_b32 s68, v32, 20 +; SI-NEXT: v_readlane_b32 s67, v32, 19 +; SI-NEXT: v_readlane_b32 s66, v32, 18 +; SI-NEXT: v_readlane_b32 s65, v32, 17 +; SI-NEXT: v_readlane_b32 s64, v32, 16 +; SI-NEXT: v_readlane_b32 s55, v32, 15 +; SI-NEXT: v_readlane_b32 s54, v32, 14 +; SI-NEXT: v_readlane_b32 s53, v32, 13 +; SI-NEXT: v_readlane_b32 s52, v32, 12 +; SI-NEXT: v_readlane_b32 s51, v32, 11 +; SI-NEXT: v_readlane_b32 s50, v32, 10 +; SI-NEXT: v_readlane_b32 s49, v32, 9 +; SI-NEXT: v_readlane_b32 s48, v32, 8 +; SI-NEXT: v_readlane_b32 s39, v32, 7 +; SI-NEXT: v_readlane_b32 s38, v32, 6 +; SI-NEXT: v_readlane_b32 s37, v32, 5 +; SI-NEXT: v_readlane_b32 s36, v32, 4 +; SI-NEXT: v_readlane_b32 s35, v32, 3 +; SI-NEXT: v_readlane_b32 s34, v32, 2 +; SI-NEXT: v_readlane_b32 s31, v32, 1 +; SI-NEXT: v_readlane_b32 s30, v32, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB65_4: -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: s_branch .LBB65_2 ; ; VI-LABEL: bitcast_v16i64_to_v64f16_scalar: @@ -104396,342 +101791,208 @@ define <16 x i64> @bitcast_v64f16_to_v16i64(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v31 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v57 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v61, v1 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v62, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v60, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v30 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v29 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v27 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v25 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v23 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v21 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_mov_b32_e32 v52, v18 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_mov_b32_e32 v53, v17 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_mov_b32_e32 v54, v16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_mov_b32_e32 v55, v15 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_mov_b32_e32 v40, v14 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_mov_b32_e32 v41, v13 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_mov_b32_e32 v42, v12 +; SI-NEXT: v_mov_b32_e32 v43, v11 +; SI-NEXT: v_mov_b32_e32 v44, v10 +; SI-NEXT: v_mov_b32_e32 v45, v9 +; SI-NEXT: v_mov_b32_e32 v46, v8 +; SI-NEXT: v_mov_b32_e32 v47, v7 +; SI-NEXT: v_mov_b32_e32 v56, v6 +; SI-NEXT: v_mov_b32_e32 v57, v5 +; SI-NEXT: v_mov_b32_e32 v58, v4 +; SI-NEXT: v_mov_b32_e32 v59, v3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v28 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v33 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v62 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB66_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v35 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: v_or_b32_e32 v18, v59, v18 -; SI-NEXT: v_or_b32_e32 v19, v57, v19 -; SI-NEXT: v_or_b32_e32 v20, v47, v20 -; SI-NEXT: v_or_b32_e32 v21, v45, v21 -; SI-NEXT: v_or_b32_e32 v22, v43, v22 -; SI-NEXT: v_or_b32_e32 v23, v41, v23 -; SI-NEXT: v_or_b32_e32 v24, v55, v24 -; SI-NEXT: v_or_b32_e32 v25, v53, v25 -; SI-NEXT: v_or_b32_e32 v26, v51, v26 -; SI-NEXT: v_or_b32_e32 v27, v49, v27 -; SI-NEXT: v_or_b32_e32 v28, v38, v28 -; SI-NEXT: v_or_b32_e32 v29, v36, v29 -; SI-NEXT: v_or_b32_e32 v30, v34, v30 -; SI-NEXT: v_or_b32_e32 v31, v33, v31 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v33 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v48 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v39 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v37 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr58 @@ -104743,429 +102004,474 @@ define <16 x i64> @bitcast_v64f16_to_v16i64(<64 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v53 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v62 -; SI-NEXT: v_or_b32_e32 v17, v61, v17 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: .LBB66_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB66_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v43 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v41 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v54 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v53 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v51 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v39 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v38 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v36 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v58 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v57 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v49 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v46 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v45 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v44 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v43 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v42 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v39 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v54 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v53 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v52 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v38 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v37 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v51 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v62 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v60 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v56 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v46 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v45 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v42 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v40 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v55 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v52 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v50 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v49 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v37 -; SI-NEXT: v_or_b32_e32 v28, v30, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v35 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v34 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_or_b32_e32 v30, v31, v30 ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 ; SI-NEXT: v_or_b32_e32 v31, v33, v31 @@ -105428,8 +102734,6 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; SI-LABEL: bitcast_v64f16_to_v16i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -105446,696 +102750,396 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v47, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v2 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s20 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_mov_b32_e32 v32, v17 +; SI-NEXT: v_mov_b32_e32 v33, v16 +; SI-NEXT: v_mov_b32_e32 v34, v15 +; SI-NEXT: v_mov_b32_e32 v35, v14 +; SI-NEXT: v_mov_b32_e32 v36, v13 +; SI-NEXT: v_mov_b32_e32 v37, v12 +; SI-NEXT: v_mov_b32_e32 v38, v11 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_mov_b32_e32 v48, v9 +; SI-NEXT: v_mov_b32_e32 v49, v8 +; SI-NEXT: v_mov_b32_e32 v50, v7 +; SI-NEXT: v_mov_b32_e32 v51, v6 +; SI-NEXT: v_mov_b32_e32 v52, v5 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v3 +; SI-NEXT: v_mov_b32_e32 v55, v2 +; SI-NEXT: v_mov_b32_e32 v40, v1 +; SI-NEXT: v_mov_b32_e32 v41, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v1 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v40 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v62, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s25 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v49, v26 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v63, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v57 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v33 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v46, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v44, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v42, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v56, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v30, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v57, v0 -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB67_2 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_cbranch_scc0 .LBB67_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v7, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 -; SI-NEXT: v_or_b32_e32 v5, v11, v5 -; SI-NEXT: v_or_b32_e32 v6, v13, v6 -; SI-NEXT: v_or_b32_e32 v7, v15, v7 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_or_b32_e32 v10, v62, v10 -; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: v_or_b32_e32 v11, v35, v11 -; SI-NEXT: v_mov_b32_e32 v35, v59 -; SI-NEXT: v_or_b32_e32 v12, v59, v12 -; SI-NEXT: v_mov_b32_e32 v33, v58 -; SI-NEXT: v_or_b32_e32 v13, v58, v13 -; SI-NEXT: v_mov_b32_e32 v32, v56 -; SI-NEXT: v_or_b32_e32 v14, v47, v14 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_or_b32_e32 v15, v45, v15 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_or_b32_e32 v18, v40, v18 -; SI-NEXT: v_or_b32_e32 v19, v55, v19 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_or_b32_e32 v20, v54, v20 -; SI-NEXT: v_or_b32_e32 v21, v53, v21 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_or_b32_e32 v22, v52, v22 -; SI-NEXT: v_or_b32_e32 v23, v51, v23 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_or_b32_e32 v24, v50, v24 -; SI-NEXT: v_or_b32_e32 v25, v49, v25 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_or_b32_e32 v26, v48, v26 -; SI-NEXT: v_or_b32_e32 v27, v39, v27 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_or_b32_e32 v28, v38, v28 -; SI-NEXT: v_or_b32_e32 v29, v63, v29 -; SI-NEXT: v_mov_b32_e32 v63, v60 -; SI-NEXT: v_or_b32_e32 v30, v60, v30 -; SI-NEXT: v_or_b32_e32 v31, v57, v31 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 -; SI-NEXT: v_or_b32_e32 v4, v9, v4 -; SI-NEXT: v_or_b32_e32 v8, v17, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v42 -; SI-NEXT: v_or_b32_e32 v9, v36, v9 -; SI-NEXT: v_mov_b32_e32 v36, v34 -; SI-NEXT: v_or_b32_e32 v16, v43, v16 -; SI-NEXT: v_mov_b32_e32 v43, v42 -; SI-NEXT: v_or_b32_e32 v17, v41, v17 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: s_branch .LBB67_3 -; SI-NEXT: .LBB67_2: -; SI-NEXT: v_mov_b32_e32 v36, v34 -; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: v_mov_b32_e32 v35, v59 -; SI-NEXT: v_mov_b32_e32 v33, v58 -; SI-NEXT: v_mov_b32_e32 v32, v56 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v43, v42 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_mov_b32_e32 v63, v60 -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: .LBB67_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v60, v39 -; SI-NEXT: v_mov_b32_e32 v38, v49 -; SI-NEXT: v_mov_b32_e32 v39, v51 -; SI-NEXT: v_mov_b32_e32 v48, v53 -; SI-NEXT: v_mov_b32_e32 v49, v55 -; SI-NEXT: v_mov_b32_e32 v50, v41 -; SI-NEXT: v_mov_b32_e32 v51, v43 -; SI-NEXT: v_mov_b32_e32 v52, v45 -; SI-NEXT: v_mov_b32_e32 v53, v47 -; SI-NEXT: v_mov_b32_e32 v54, v32 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v35 -; SI-NEXT: v_mov_b32_e32 v34, v36 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: s_cbranch_vccnz .LBB67_5 -; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v32 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; SI-NEXT: v_mov_b32_e32 v47, v16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v17 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: v_mov_b32_e32 v46, v17 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: v_mov_b32_e32 v45, v18 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: v_mov_b32_e32 v44, v19 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v20 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_mov_b32_e32 v43, v20 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_mov_b32_e32 v42, v21 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v13 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v11 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v8 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v28, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v29, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v40 +; SI-NEXT: v_mov_b32_e32 v56, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v30, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v13 +; SI-NEXT: v_mov_b32_e32 v62, v12 +; SI-NEXT: v_mov_b32_e32 v61, v11 +; SI-NEXT: v_mov_b32_e32 v60, v10 +; SI-NEXT: v_mov_b32_e32 v59, v9 +; SI-NEXT: v_mov_b32_e32 v58, v8 +; SI-NEXT: v_or_b32_e32 v31, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 +; SI-NEXT: s_cbranch_execnz .LBB67_3 +; SI-NEXT: .LBB67_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v43 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v38 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v60 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v2, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v41 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v40 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v55 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v54 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v53 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v44 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v52 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v51 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v63 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v49 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v48 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v60 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v38 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v37 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v34 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v46 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v45 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v43 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v42 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v50 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v23, v62 ; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v61 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 @@ -106148,33 +103152,25 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v26, v59 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v58 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v36 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v28, v30, v28 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 @@ -106182,14 +103178,21 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v63 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; SI-NEXT: v_or_b32_e32 v31, v33, v31 -; SI-NEXT: .LBB67_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v33 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: .LBB67_3: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -106206,8 +103209,27 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB67_4: +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v1 +; SI-NEXT: v_mov_b32_e32 v56, v3 +; SI-NEXT: v_mov_b32_e32 v47, v16 +; SI-NEXT: v_mov_b32_e32 v46, v17 +; SI-NEXT: v_mov_b32_e32 v45, v18 +; SI-NEXT: v_mov_b32_e32 v44, v19 +; SI-NEXT: v_mov_b32_e32 v43, v20 +; SI-NEXT: v_mov_b32_e32 v42, v21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v13 +; SI-NEXT: v_mov_b32_e32 v62, v12 +; SI-NEXT: v_mov_b32_e32 v61, v11 +; SI-NEXT: v_mov_b32_e32 v60, v10 +; SI-NEXT: v_mov_b32_e32 v59, v9 +; SI-NEXT: v_mov_b32_e32 v58, v8 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB67_2 ; ; VI-LABEL: bitcast_v64f16_to_v16i64_scalar: ; VI: ; %bb.0: @@ -136955,532 +133977,202 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB80_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v27 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v37 -; SI-NEXT: v_mov_b32_e32 v37, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v38 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v24 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v39 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v30 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v49 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 +; SI-NEXT: v_alignbit_b32 v32, v31, v30, 16 +; SI-NEXT: v_alignbit_b32 v33, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v34, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v35, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v36, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v37, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v38, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v39, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v48, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v49, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v50, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v52, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v40, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v43, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v46, v3, v2, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v57, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v19 ; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v51, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v50, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v33 -; SI-NEXT: v_mov_b32_e32 v33, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v32 -; SI-NEXT: v_mov_b32_e32 v32, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v3 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 ; SI-NEXT: .LBB80_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB80_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v62 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v26 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 -; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v60 -; SI-NEXT: v_add_f64 v[32:33], v[10:11], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 ; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 ; SI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v31 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v63 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: v_alignbit_b32 v32, v31, v30, 16 +; SI-NEXT: v_alignbit_b32 v33, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v34, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v35, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v36, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v37, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v38, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v39, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v48, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v49, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v50, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v52, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v40, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v43, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v46, v3, v2, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v57, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v2 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 ; SI-NEXT: .LBB80_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_or_b32_e32 v0, v0, v57 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v63 +; SI-NEXT: v_or_b32_e32 v2, v2, v46 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v62 +; SI-NEXT: v_or_b32_e32 v4, v4, v43 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v61 +; SI-NEXT: v_or_b32_e32 v6, v6, v40 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v60 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v57 +; SI-NEXT: v_or_b32_e32 v3, v3, v46 +; SI-NEXT: v_or_b32_e32 v5, v5, v43 +; SI-NEXT: v_or_b32_e32 v7, v7, v40 +; SI-NEXT: v_or_b32_e32 v8, v8, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v59 +; SI-NEXT: v_or_b32_e32 v10, v10, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v58 +; SI-NEXT: v_or_b32_e32 v12, v12, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v56 +; SI-NEXT: v_or_b32_e32 v14, v14, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v47 +; SI-NEXT: v_or_b32_e32 v16, v16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v45 +; SI-NEXT: v_or_b32_e32 v18, v18, v38 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v44 +; SI-NEXT: v_or_b32_e32 v20, v20, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v42 +; SI-NEXT: v_or_b32_e32 v22, v22, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v41 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v40 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v58 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v43 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v48 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v47 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v10, v45 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v41 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload @@ -137497,91 +134189,47 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v53 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v18, v39 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v35 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v36 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v28, v50 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 -; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v30, v33 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v24, v24, v35 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v55 +; SI-NEXT: v_or_b32_e32 v26, v26, v34 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v54 +; SI-NEXT: v_or_b32_e32 v28, v28, v33 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v53 +; SI-NEXT: v_or_b32_e32 v30, v30, v32 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v51 +; SI-NEXT: v_or_b32_e32 v9, v9, v52 +; SI-NEXT: v_or_b32_e32 v11, v11, v50 +; SI-NEXT: v_or_b32_e32 v13, v13, v49 +; SI-NEXT: v_or_b32_e32 v15, v15, v48 +; SI-NEXT: v_or_b32_e32 v17, v17, v39 +; SI-NEXT: v_or_b32_e32 v19, v19, v38 +; SI-NEXT: v_or_b32_e32 v21, v21, v37 +; SI-NEXT: v_or_b32_e32 v23, v23, v36 +; SI-NEXT: v_or_b32_e32 v25, v25, v35 +; SI-NEXT: v_or_b32_e32 v27, v27, v34 +; SI-NEXT: v_or_b32_e32 v29, v29, v33 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; SI-NEXT: v_or_b32_e32 v31, v33, v31 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f64_to_v64f16: @@ -137709,21 +134357,21 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: v_mov_b32_e32 v18, s16 -; SI-NEXT: v_mov_b32_e32 v19, s17 -; SI-NEXT: v_mov_b32_e32 v20, s18 -; SI-NEXT: v_mov_b32_e32 v21, s19 +; SI-NEXT: v_mov_b32_e32 v30, s16 +; SI-NEXT: v_mov_b32_e32 v31, s17 +; SI-NEXT: v_mov_b32_e32 v28, s18 +; SI-NEXT: v_mov_b32_e32 v29, s19 ; SI-NEXT: v_mov_b32_e32 v26, s20 ; SI-NEXT: v_mov_b32_e32 v27, s21 -; SI-NEXT: v_mov_b32_e32 v30, s22 -; SI-NEXT: v_mov_b32_e32 v31, s23 -; SI-NEXT: v_mov_b32_e32 v28, s24 -; SI-NEXT: v_mov_b32_e32 v29, s25 -; SI-NEXT: v_mov_b32_e32 v24, s26 -; SI-NEXT: v_mov_b32_e32 v25, s27 +; SI-NEXT: v_mov_b32_e32 v24, s22 +; SI-NEXT: v_mov_b32_e32 v25, s23 +; SI-NEXT: v_mov_b32_e32 v22, s24 +; SI-NEXT: v_mov_b32_e32 v23, s25 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v21, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v22, s28 -; SI-NEXT: v_mov_b32_e32 v23, s29 +; SI-NEXT: v_mov_b32_e32 v18, s28 +; SI-NEXT: v_mov_b32_e32 v19, s29 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -137742,520 +134390,229 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB81_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v39, v15 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v58, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v13 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v11 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v32 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v9 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v29 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v8 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v35 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v35 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v5 +; SI-NEXT: v_lshr_b64 v[42:43], v[16:17], 16 ; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[43:44], v[14:15], 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v31 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v35 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v30 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v2 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v35 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v35 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v23 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v22 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v26 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v0 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v35 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v24 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v35 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v35 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v28 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v19 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v35 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v16 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v18 +; SI-NEXT: v_lshr_b64 v[32:33], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[44:45], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[33:34], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[45:46], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[56:57], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[24:25], 16 +; SI-NEXT: v_lshr_b64 v[46:47], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[57:58], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[26:27], 16 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v31 +; SI-NEXT: v_lshr_b64 v[47:48], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[58:59], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[30:31], 16 ; SI-NEXT: s_cbranch_execnz .LBB81_3 ; SI-NEXT: .LBB81_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f64 v[39:40], v[24:25], 1.0 -; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; SI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 -; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 -; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 ; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 ; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v31 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f64 v[0:1], v[20:21], 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v0 -; SI-NEXT: v_add_f64 v[0:1], v[18:19], 1.0 -; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 -; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v57 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v40 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v56 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v44 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshr_b64 v[42:43], v[16:17], 16 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[43:44], v[14:15], 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v5 -; SI-NEXT: v_mov_b32_e32 v36, v16 -; SI-NEXT: v_mov_b32_e32 v35, v17 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[32:33], v[20:21], 16 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_lshr_b64 v[44:45], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[33:34], v[22:23], 16 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; SI-NEXT: v_lshr_b64 v[45:46], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[56:57], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[24:25], 16 +; SI-NEXT: v_lshr_b64 v[46:47], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[57:58], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[26:27], 16 +; SI-NEXT: v_lshr_b64 v[47:48], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[58:59], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[30:31], 16 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v31 ; SI-NEXT: .LBB81_3: ; %end -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v61 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v42 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v50 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v38 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v45 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v55 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v58 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v51 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v10, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v35 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v34 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v36, v30, v36 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v52 +; SI-NEXT: v_or_b32_e32 v37, v30, v31 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v38 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v38, v28, v30 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v50 +; SI-NEXT: v_or_b32_e32 v39, v28, v29 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v35 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v48, v26, v28 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v49 +; SI-NEXT: v_or_b32_e32 v49, v26, v27 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v34 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v50, v24, v26 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v51 +; SI-NEXT: v_or_b32_e32 v51, v24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v33 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v52, v22, v24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v53 +; SI-NEXT: v_or_b32_e32 v53, v22, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v32 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v54, v20, v22 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v55 +; SI-NEXT: v_or_b32_e32 v55, v20, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v40 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v40, v18, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v61 +; SI-NEXT: v_or_b32_e32 v41, v18, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v58 +; SI-NEXT: v_or_b32_e32 v32, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v60 +; SI-NEXT: v_or_b32_e32 v33, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 +; SI-NEXT: v_or_b32_e32 v34, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63 +; SI-NEXT: v_or_b32_e32 v35, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v56 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_mov_b32_e32 v2, v38 +; SI-NEXT: v_mov_b32_e32 v3, v39 +; SI-NEXT: v_mov_b32_e32 v4, v48 +; SI-NEXT: v_mov_b32_e32 v5, v49 +; SI-NEXT: v_mov_b32_e32 v6, v50 +; SI-NEXT: v_mov_b32_e32 v7, v51 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_mov_b32_e32 v8, v52 +; SI-NEXT: v_mov_b32_e32 v9, v53 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v24, v48 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_mov_b32_e32 v10, v54 +; SI-NEXT: v_mov_b32_e32 v11, v55 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v52 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: v_mov_b32_e32 v12, v40 +; SI-NEXT: v_mov_b32_e32 v13, v41 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v28, v41 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: v_or_b32_e32 v28, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v15 +; SI-NEXT: v_mov_b32_e32 v14, v32 +; SI-NEXT: v_mov_b32_e32 v15, v33 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v46 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v29, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -138272,111 +134629,57 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v36 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; SI-NEXT: v_or_b32_e32 v31, v33, v31 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB81_4: -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v30, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: v_mov_b32_e32 v16, v34 +; SI-NEXT: v_mov_b32_e32 v17, v35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v31, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v36 +; SI-NEXT: v_mov_b32_e32 v1, v37 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB81_4: +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 ; SI-NEXT: s_branch .LBB81_2 ; ; VI-LABEL: bitcast_v16f64_to_v64f16_scalar: @@ -138593,342 +134896,208 @@ define <16 x double> @bitcast_v64f16_to_v16f64(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v31 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v57 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v61, v1 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v62, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v60, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v30 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v29 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v27 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v25 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v23 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v21 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_mov_b32_e32 v52, v18 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_mov_b32_e32 v53, v17 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_mov_b32_e32 v54, v16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_mov_b32_e32 v55, v15 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_mov_b32_e32 v40, v14 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_mov_b32_e32 v41, v13 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_mov_b32_e32 v42, v12 +; SI-NEXT: v_mov_b32_e32 v43, v11 +; SI-NEXT: v_mov_b32_e32 v44, v10 +; SI-NEXT: v_mov_b32_e32 v45, v9 +; SI-NEXT: v_mov_b32_e32 v46, v8 +; SI-NEXT: v_mov_b32_e32 v47, v7 +; SI-NEXT: v_mov_b32_e32 v56, v6 +; SI-NEXT: v_mov_b32_e32 v57, v5 +; SI-NEXT: v_mov_b32_e32 v58, v4 +; SI-NEXT: v_mov_b32_e32 v59, v3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v28 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v33 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v62 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB82_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v35 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: v_or_b32_e32 v18, v59, v18 -; SI-NEXT: v_or_b32_e32 v19, v57, v19 -; SI-NEXT: v_or_b32_e32 v20, v47, v20 -; SI-NEXT: v_or_b32_e32 v21, v45, v21 -; SI-NEXT: v_or_b32_e32 v22, v43, v22 -; SI-NEXT: v_or_b32_e32 v23, v41, v23 -; SI-NEXT: v_or_b32_e32 v24, v55, v24 -; SI-NEXT: v_or_b32_e32 v25, v53, v25 -; SI-NEXT: v_or_b32_e32 v26, v51, v26 -; SI-NEXT: v_or_b32_e32 v27, v49, v27 -; SI-NEXT: v_or_b32_e32 v28, v38, v28 -; SI-NEXT: v_or_b32_e32 v29, v36, v29 -; SI-NEXT: v_or_b32_e32 v30, v34, v30 -; SI-NEXT: v_or_b32_e32 v31, v33, v31 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v33 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v48 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v39 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v37 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr58 @@ -138940,429 +135109,474 @@ define <16 x double> @bitcast_v64f16_to_v16f64(<64 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v53 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v62 -; SI-NEXT: v_or_b32_e32 v17, v61, v17 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: .LBB82_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB82_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v43 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v41 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v54 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v53 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v51 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v39 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v38 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v36 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v58 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v57 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v49 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v46 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v45 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v44 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v43 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v42 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v39 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v54 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v53 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v52 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v38 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v37 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v51 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v62 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v60 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v56 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v46 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v45 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v42 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v40 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v55 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v52 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v50 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v49 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v37 -; SI-NEXT: v_or_b32_e32 v28, v30, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v35 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v34 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_or_b32_e32 v30, v31, v30 ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 ; SI-NEXT: v_or_b32_e32 v31, v33, v31 @@ -139625,8 +135839,6 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; SI-LABEL: bitcast_v64f16_to_v16f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -139643,696 +135855,396 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v47, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v2 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s20 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_mov_b32_e32 v32, v17 +; SI-NEXT: v_mov_b32_e32 v33, v16 +; SI-NEXT: v_mov_b32_e32 v34, v15 +; SI-NEXT: v_mov_b32_e32 v35, v14 +; SI-NEXT: v_mov_b32_e32 v36, v13 +; SI-NEXT: v_mov_b32_e32 v37, v12 +; SI-NEXT: v_mov_b32_e32 v38, v11 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_mov_b32_e32 v48, v9 +; SI-NEXT: v_mov_b32_e32 v49, v8 +; SI-NEXT: v_mov_b32_e32 v50, v7 +; SI-NEXT: v_mov_b32_e32 v51, v6 +; SI-NEXT: v_mov_b32_e32 v52, v5 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v3 +; SI-NEXT: v_mov_b32_e32 v55, v2 +; SI-NEXT: v_mov_b32_e32 v40, v1 +; SI-NEXT: v_mov_b32_e32 v41, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v1 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v40 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v62, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s25 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v49, v26 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v63, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v57 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v33 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v46, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v44, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v42, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v56, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v30, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v57, v0 -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB83_2 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_cbranch_scc0 .LBB83_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v7, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 -; SI-NEXT: v_or_b32_e32 v5, v11, v5 -; SI-NEXT: v_or_b32_e32 v6, v13, v6 -; SI-NEXT: v_or_b32_e32 v7, v15, v7 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_or_b32_e32 v10, v62, v10 -; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: v_or_b32_e32 v11, v35, v11 -; SI-NEXT: v_mov_b32_e32 v35, v59 -; SI-NEXT: v_or_b32_e32 v12, v59, v12 -; SI-NEXT: v_mov_b32_e32 v33, v58 -; SI-NEXT: v_or_b32_e32 v13, v58, v13 -; SI-NEXT: v_mov_b32_e32 v32, v56 -; SI-NEXT: v_or_b32_e32 v14, v47, v14 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_or_b32_e32 v15, v45, v15 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_or_b32_e32 v18, v40, v18 -; SI-NEXT: v_or_b32_e32 v19, v55, v19 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_or_b32_e32 v20, v54, v20 -; SI-NEXT: v_or_b32_e32 v21, v53, v21 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_or_b32_e32 v22, v52, v22 -; SI-NEXT: v_or_b32_e32 v23, v51, v23 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_or_b32_e32 v24, v50, v24 -; SI-NEXT: v_or_b32_e32 v25, v49, v25 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_or_b32_e32 v26, v48, v26 -; SI-NEXT: v_or_b32_e32 v27, v39, v27 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_or_b32_e32 v28, v38, v28 -; SI-NEXT: v_or_b32_e32 v29, v63, v29 -; SI-NEXT: v_mov_b32_e32 v63, v60 -; SI-NEXT: v_or_b32_e32 v30, v60, v30 -; SI-NEXT: v_or_b32_e32 v31, v57, v31 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 -; SI-NEXT: v_or_b32_e32 v4, v9, v4 -; SI-NEXT: v_or_b32_e32 v8, v17, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v42 -; SI-NEXT: v_or_b32_e32 v9, v36, v9 -; SI-NEXT: v_mov_b32_e32 v36, v34 -; SI-NEXT: v_or_b32_e32 v16, v43, v16 -; SI-NEXT: v_mov_b32_e32 v43, v42 -; SI-NEXT: v_or_b32_e32 v17, v41, v17 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: s_branch .LBB83_3 -; SI-NEXT: .LBB83_2: -; SI-NEXT: v_mov_b32_e32 v36, v34 -; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: v_mov_b32_e32 v35, v59 -; SI-NEXT: v_mov_b32_e32 v33, v58 -; SI-NEXT: v_mov_b32_e32 v32, v56 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v43, v42 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_mov_b32_e32 v63, v60 -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: .LBB83_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v60, v39 -; SI-NEXT: v_mov_b32_e32 v38, v49 -; SI-NEXT: v_mov_b32_e32 v39, v51 -; SI-NEXT: v_mov_b32_e32 v48, v53 -; SI-NEXT: v_mov_b32_e32 v49, v55 -; SI-NEXT: v_mov_b32_e32 v50, v41 -; SI-NEXT: v_mov_b32_e32 v51, v43 -; SI-NEXT: v_mov_b32_e32 v52, v45 -; SI-NEXT: v_mov_b32_e32 v53, v47 -; SI-NEXT: v_mov_b32_e32 v54, v32 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v35 -; SI-NEXT: v_mov_b32_e32 v34, v36 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: s_cbranch_vccnz .LBB83_5 -; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v32 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; SI-NEXT: v_mov_b32_e32 v47, v16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v17 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: v_mov_b32_e32 v46, v17 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: v_mov_b32_e32 v45, v18 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: v_mov_b32_e32 v44, v19 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v20 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_mov_b32_e32 v43, v20 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_mov_b32_e32 v42, v21 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v13 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v11 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v8 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v28, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v29, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v40 +; SI-NEXT: v_mov_b32_e32 v56, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v30, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v13 +; SI-NEXT: v_mov_b32_e32 v62, v12 +; SI-NEXT: v_mov_b32_e32 v61, v11 +; SI-NEXT: v_mov_b32_e32 v60, v10 +; SI-NEXT: v_mov_b32_e32 v59, v9 +; SI-NEXT: v_mov_b32_e32 v58, v8 +; SI-NEXT: v_or_b32_e32 v31, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 +; SI-NEXT: s_cbranch_execnz .LBB83_3 +; SI-NEXT: .LBB83_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v43 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v38 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v60 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v2, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v41 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v40 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v55 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v54 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v53 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v44 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v52 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v51 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v63 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v49 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v48 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v60 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v38 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v37 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v34 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v46 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v45 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v43 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v42 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v50 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v23, v62 ; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v61 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 @@ -140345,33 +136257,25 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v26, v59 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v58 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v36 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v28, v30, v28 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 @@ -140379,14 +136283,21 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v63 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; SI-NEXT: v_or_b32_e32 v31, v33, v31 -; SI-NEXT: .LBB83_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v33 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: .LBB83_3: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -140403,8 +136314,27 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB83_4: +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v1 +; SI-NEXT: v_mov_b32_e32 v56, v3 +; SI-NEXT: v_mov_b32_e32 v47, v16 +; SI-NEXT: v_mov_b32_e32 v46, v17 +; SI-NEXT: v_mov_b32_e32 v45, v18 +; SI-NEXT: v_mov_b32_e32 v44, v19 +; SI-NEXT: v_mov_b32_e32 v43, v20 +; SI-NEXT: v_mov_b32_e32 v42, v21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v13 +; SI-NEXT: v_mov_b32_e32 v62, v12 +; SI-NEXT: v_mov_b32_e32 v61, v11 +; SI-NEXT: v_mov_b32_e32 v60, v10 +; SI-NEXT: v_mov_b32_e32 v59, v9 +; SI-NEXT: v_mov_b32_e32 v58, v8 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB83_2 ; ; VI-LABEL: bitcast_v64f16_to_v16f64_scalar: ; VI: ; %bb.0: @@ -169474,1790 +165404,1999 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v57, v12 -; SI-NEXT: v_mov_b32_e32 v50, v0 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:388 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v59, v24 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v47, v16 +; SI-NEXT: v_mov_b32_e32 v61, v14 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v60, v6 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:388 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:32 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:64 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:144 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:152 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:168 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:176 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:184 -; SI-NEXT: v_mov_b32_e32 v44, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:184 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 +; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v25 -; SI-NEXT: v_mov_b32_e32 v60, v26 -; SI-NEXT: v_mov_b32_e32 v45, v20 -; SI-NEXT: v_mov_b32_e32 v56, v14 -; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 -; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v15 -; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v17 -; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 -; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v21 -; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 -; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v29 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v55 -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v13 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v21 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v23 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v19 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v29 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v27 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v43, v8 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:188 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:196 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:124 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:140 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:148 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:164 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:180 -; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v34 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:108 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v6 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:100 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:136 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:228 -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:252 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:260 -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v32 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v33 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:196 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:276 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:284 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:292 -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:232 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:344 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:228 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:316 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v12 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 +; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v2 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:324 -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:264 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:332 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:300 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:336 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:340 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:360 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:376 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:296 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:348 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:292 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v15 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v1 -; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:328 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:356 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:52 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:364 -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v2 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:376 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v0 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:380 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:368 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:360 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:356 +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v15 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v62, 24, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:372 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB92_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xff, v50 -; SI-NEXT: v_or_b32_e32 v0, v0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v0, v0, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v17, v0, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v8 -; SI-NEXT: v_or_b32_e32 v21, v0, v9 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 -; SI-NEXT: v_or_b32_e32 v15, v0, v14 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v57 -; SI-NEXT: v_or_b32_e32 v63, v0, v13 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v56 -; SI-NEXT: v_or_b32_e32 v14, v0, v58 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v11, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v16, 0xff, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xff, v45 -; SI-NEXT: v_or_b32_e32 v16, v16, v19 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v40 -; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v42 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v41 -; SI-NEXT: v_and_b32_e32 v33, 0xff, v53 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v52 -; SI-NEXT: v_or_b32_e32 v6, v6, v34 -; SI-NEXT: v_and_b32_e32 v34, 0xff, v31 -; SI-NEXT: v_and_b32_e32 v35, 0xff, v35 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v46 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v47 -; SI-NEXT: v_and_b32_e32 v31, 0xff, v11 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v13, v0, v59 -; SI-NEXT: v_mov_b32_e32 v0, v61 -; SI-NEXT: v_mov_b32_e32 v61, v3 -; SI-NEXT: v_mov_b32_e32 v3, v23 -; SI-NEXT: v_or_b32_e32 v23, v18, v62 -; SI-NEXT: v_and_b32_e32 v18, 0xff, v44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v20, v18, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v18, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v51, 0xff, v47 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v48, 0xff, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v58 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v62, v3 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v22, v18, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v18, 0xff, v60 -; SI-NEXT: v_or_b32_e32 v24, v18, v27 -; SI-NEXT: v_and_b32_e32 v18, 0xff, v28 -; SI-NEXT: v_or_b32_e32 v26, v18, v29 -; SI-NEXT: v_and_b32_e32 v18, 0xff, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v27, 0xff, v54 -; SI-NEXT: v_and_b32_e32 v29, 0xff, v51 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v19, v18, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v18, 0xff, v25 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_mov_b32_e32 v60, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v25, v18, v1 +; SI-NEXT: v_or_b32_e32 v5, v1, v5 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v9, v1, v9 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v18, 0xff, v43 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v28, v18, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v18, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v32, v18, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v18, 0xff, v36 +; SI-NEXT: v_or_b32_e32 v11, v10, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v7, v7, v1 +; SI-NEXT: v_or_b32_e32 v10, v1, v10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v12, v1, v12 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v10, v10, v1 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v13, v13, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v1, v14 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v14, v1, v14 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v12, v12, v1 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v15, v15, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v25, v1, v16 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v36, v18, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v18, 0xff, v37 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v37, v18, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v18, 0xff, v38 +; SI-NEXT: v_or_b32_e32 v16, v1, v16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v2, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v19, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v4, v4, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v19, v19, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v38, v18, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v18, 0xff, v39 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v39, v18, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v18, 0xff, v48 +; SI-NEXT: v_or_b32_e32 v49, v1, v20 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v48, v18, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v18, 0xff, v49 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v49, v18, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v18, 0xff, v55 +; SI-NEXT: v_or_b32_e32 v20, v1, v20 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v18, v18, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v27, v27, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v21, v21, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v33, v33, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v29, v29, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v50, v1, v22 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v22, v1, v22 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v23, v23, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v52, v1, v24 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v24, v1, v24 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v34, v34, v1 +; SI-NEXT: v_and_b32_e32 v27, 0xff, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v35, v35, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v27, v27, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v8, v8, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v28, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v9, v9, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v53, v1, v28 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v3, v3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v28, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v50, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v28, v1, v28 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v50, v50, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v29, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v51, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v29, v29, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v51, v51, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v30, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v52, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v54, v1, v30 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v52, v52, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v30, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v53, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v30, v1, v30 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v53, v53, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v33, v31, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v5, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v54, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v55, v1, v31 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v54, v54, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v55, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v34, v1, v31 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v55, v55, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v40, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v35, v31, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v40, v40, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v41, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v41, v1, v31 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v41, v41, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v36, v1, v31 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v42, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v37, v31, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v42, v42, v1 +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v42, v1, v31 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v38, v1, v31 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v39, v31, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v44, v1, v31 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v43, 0xff, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v45, v1, v31 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v43, v43, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v63, v31, v63 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v44, 0xff, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v26, v26, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v44, v44, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v7, v7, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v45, 0xff, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v32, v31, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v45, v45, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v46, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v46, 0xff, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v46, v56, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v46, v46, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v31, v31, v40 +; SI-NEXT: v_and_b32_e32 v40, 0xff, v59 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v31, v31, v9 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v48, v48, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v47, 0xff, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v51, v51, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v47, v47, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v56, v40, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v56, 0xff, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v40, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v56, v56, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v57, v40, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v57, 0xff, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v40, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v57, v57, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v58, v40, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v58, 0xff, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v40, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v58, v58, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v62, v40, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v59, 0xff, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v40, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v59, v59, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v18, v40, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v60, 0xff, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v60, v60, v61 +; SI-NEXT: v_and_b32_e32 v40, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v61, 0xff, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v61, v61, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v62, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v6, v40, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v62, v62, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v40, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v31, v31, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_and_b32_e32 v40, 0xff, v60 +; SI-NEXT: v_or_b32_e32 v40, v40, v43 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v30, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_and_b32_e32 v43, 0xff, v43 +; SI-NEXT: v_or_b32_e32 v60, v43, v47 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v25 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v43, 0xff, v43 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v47, v43, v47 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v43, 0xff, v43 +; SI-NEXT: v_or_b32_e32 v61, v43, v59 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v43, 0xff, v43 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v43, v43, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v59, 0xff, v59 +; SI-NEXT: v_or_b32_e32 v59, v59, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v5 +; SI-NEXT: v_alignbit_b32 v9, v8, v9, 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v54 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v11, v9, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v48, v9, v12 +; SI-NEXT: v_alignbit_b32 v9, v11, v12, 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v12, v9, v17 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v9, v9, v14 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v9, v12, v14, 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v41 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v51, v9, v25 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v56 +; SI-NEXT: v_or_b32_e32 v15, v9, v16 +; SI-NEXT: v_alignbit_b32 v9, v51, v16, 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v9, v9, v49 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v16, v13, v20 +; SI-NEXT: v_alignbit_b32 v13, v9, v20, 16 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v31, v13, v50 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v58 +; SI-NEXT: v_or_b32_e32 v13, v13, v22 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v13, v31, v22, 16 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v62 +; SI-NEXT: v_or_b32_e32 v13, v13, v52 +; SI-NEXT: v_or_b32_e32 v14, v14, v24 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v14, v13, v24, 16 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v24, v14, v53 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v14, v28 +; SI-NEXT: v_alignbit_b32 v14, v24, v28, 16 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v21, v14, v54 +; SI-NEXT: v_or_b32_e32 v22, v6, v30 +; SI-NEXT: v_alignbit_b32 v6, v21, v30, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v56 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v19, v6, v55 +; SI-NEXT: v_or_b32_e32 v29, v1, v34 +; SI-NEXT: v_alignbit_b32 v1, v19, v34, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v57 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v23, v1, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v20, v1, v36 +; SI-NEXT: v_alignbit_b32 v1, v23, v36, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v30, v1, v42 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v60 +; SI-NEXT: v_or_b32_e32 v27, v1, v38 +; SI-NEXT: v_alignbit_b32 v1, v30, v38, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v28, v1, v44 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v47 +; SI-NEXT: v_or_b32_e32 v34, v1, v45 +; SI-NEXT: v_alignbit_b32 v1, v28, v45, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v60 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 +; SI-NEXT: v_or_b32_e32 v33, v1, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v36, v1, v7 +; SI-NEXT: v_alignbit_b32 v1, v33, v7, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v35, v1, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v38, v1, v46 +; SI-NEXT: v_alignbit_b32 v1, v35, v46, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v62 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v37, v1, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v59 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_or_b32_e32 v39, v1, v3 +; SI-NEXT: v_alignbit_b32 v1, v37, v3, 16 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: .LBB92_2: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB92_4 -; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; SI-NEXT: s_xor_b64 exec, exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB92_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_or_b32_e32 v2, v58, v2 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_mov_b32_e32 v33, v46 -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 -; SI-NEXT: v_and_b32_e32 v33, 0xff, v33 -; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 -; SI-NEXT: v_and_b32_e32 v35, 0xff, v35 -; SI-NEXT: v_mov_b32_e32 v63, v50 -; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v31 -; SI-NEXT: v_and_b32_e32 v50, 0xff, v50 -; SI-NEXT: v_add_i32_e32 v51, vcc, 3, v51 -; SI-NEXT: v_and_b32_e32 v51, 0xff, v51 -; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v53 -; SI-NEXT: v_and_b32_e32 v53, 0xff, v53 -; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 -; SI-NEXT: v_and_b32_e32 v54, 0xff, v54 -; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v55 -; SI-NEXT: v_and_b32_e32 v55, 0xff, v55 -; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 -; SI-NEXT: v_and_b32_e32 v49, 0xff, v49 -; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 -; SI-NEXT: v_and_b32_e32 v48, 0xff, v48 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v44, v28 -; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 -; SI-NEXT: v_and_b32_e32 v39, 0xff, v39 -; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 -; SI-NEXT: v_and_b32_e32 v38, 0xff, v38 -; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v57 -; SI-NEXT: v_and_b32_e32 v57, 0xff, v57 ; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 -; SI-NEXT: v_and_b32_e32 v37, 0xff, v37 -; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 -; SI-NEXT: v_and_b32_e32 v36, 0xff, v36 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 -; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v63 -; SI-NEXT: v_and_b32_e32 v63, 0xff, v63 -; SI-NEXT: v_or_b32_e32 v63, v22, v63 -; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v56 -; SI-NEXT: v_and_b32_e32 v56, 0xff, v56 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_or_b32_e32 v56, v58, v56 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 -; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 -; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 -; SI-NEXT: v_and_b32_e32 v52, 0xff, v52 -; SI-NEXT: v_or_b32_e32 v52, v34, v52 -; SI-NEXT: v_add_i32_e32 v52, vcc, s6, v52 -; SI-NEXT: v_add_i32_e32 v63, vcc, s6, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_add_i32_e32 v56, vcc, s6, v56 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v11, v1, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v62, v3 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_add_i32_e32 v37, vcc, s7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v37 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x300, v11 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v15, v61, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v61, v10 -; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v61 -; SI-NEXT: v_and_b32_e32 v58, 0xff, v58 -; SI-NEXT: v_or_b32_e32 v58, v14, v58 -; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; SI-NEXT: v_add_i32_e32 v58, vcc, s6, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v17, v3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v3, v2 -; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v21, v1, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v26, v6 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v5, v1, v5 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x300, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v39, vcc, s7, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v56, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v38, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v46, v4 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v35, vcc, s7, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v35 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v36, vcc, s7, v4 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v5, v63, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v33, vcc, s7, v5 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v33 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v34, vcc, s7, v6 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v28, vcc, s7, v7 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v28 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v16, v1, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, v13 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, s7, v8 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: v_mov_b32_e32 v1, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v41 -; SI-NEXT: v_add_i32_e32 v41, vcc, 3, v44 -; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v45 -; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v46 -; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v18 -; SI-NEXT: v_and_b32_e32 v46, 0xff, v46 -; SI-NEXT: v_or_b32_e32 v46, v19, v46 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v57, v0, v57 -; SI-NEXT: v_and_b32_e32 v41, 0xff, v41 -; SI-NEXT: v_or_b32_e32 v41, v29, v41 -; SI-NEXT: v_and_b32_e32 v45, 0xff, v45 -; SI-NEXT: v_or_b32_e32 v45, v62, v45 -; SI-NEXT: v_add_i32_e32 v62, vcc, 3, v3 -; SI-NEXT: v_and_b32_e32 v62, 0xff, v62 -; SI-NEXT: v_or_b32_e32 v62, v26, v62 -; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v44, 0xff, v44 -; SI-NEXT: v_and_b32_e32 v61, 0xff, v61 -; SI-NEXT: v_or_b32_e32 v61, v5, v61 -; SI-NEXT: v_add_i32_e32 v41, vcc, s6, v41 -; SI-NEXT: v_add_i32_e32 v45, vcc, s6, v45 -; SI-NEXT: v_add_i32_e32 v46, vcc, s6, v46 -; SI-NEXT: v_add_i32_e32 v57, vcc, s6, v57 -; SI-NEXT: v_add_i32_e32 v61, vcc, s6, v61 -; SI-NEXT: v_add_i32_e32 v62, vcc, s6, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_or_b32_e32 v2, v2, v13 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v2, v13 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v2, v13 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_add_i32_e32 v30, vcc, s7, v9 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v30 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v2, v13 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v2, v13 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v23, vcc, s7, v11 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v2, v13 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, s7, v12 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v2, v13 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v23, v2, v13 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v47 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v18 -; SI-NEXT: v_and_b32_e32 v47, 0xff, v47 -; SI-NEXT: v_or_b32_e32 v47, v59, v47 -; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v59, 0xff, v59 -; SI-NEXT: v_or_b32_e32 v59, v9, v59 -; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; SI-NEXT: v_add_i32_e32 v47, vcc, s6, v47 -; SI-NEXT: v_add_i32_e32 v59, vcc, s6, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_or_b32_e32 v13, v2, v13 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v33, v2, v33 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v33, vcc, s6, v33 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v35, v2, v35 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v35, vcc, s6, v35 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v50, v2, v50 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v50, vcc, s6, v50 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v51, v2, v51 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v51, vcc, s6, v51 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v22, vcc, s7, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v53, v2, v53 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v53, vcc, s6, v53 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v54, v2, v54 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v54, vcc, s6, v54 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v55, v2, v55 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v55, vcc, s6, v55 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v49, v2, v49 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v49, vcc, s6, v49 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v48, v2, v48 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v48, vcc, s6, v48 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v39, v2, v39 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v39, vcc, s6, v39 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v24, vcc, s7, v17 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v38, v2, v38 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v38, vcc, s6, v38 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v4, v2, v4 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v42 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v2, v10, v2 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v60 -; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v6 -; SI-NEXT: v_and_b32_e32 v60, 0xff, v60 -; SI-NEXT: v_or_b32_e32 v60, v20, v60 -; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v42, 0xff, v42 -; SI-NEXT: v_or_b32_e32 v42, v27, v42 -; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_add_i32_e32 v42, vcc, s6, v42 -; SI-NEXT: v_add_i32_e32 v60, vcc, s6, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_or_b32_e32 v37, v10, v37 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v37, vcc, s6, v37 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v36, v10, v36 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v36, vcc, s6, v36 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v18 +; SI-NEXT: v_mov_b32_e32 v18, v5 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v12, v10, v12 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v40 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v10, v28, v10 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v7, v28, v7 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v49 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v52 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_or_b32_e32 v32, v28, v32 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v43 -; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v25, v40, v25 -; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v40, 0xff, v40 -; SI-NEXT: v_and_b32_e32 v43, 0xff, v43 -; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 -; SI-NEXT: v_or_b32_e32 v28, v31, v28 -; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v32 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_or_b32_e32 v43, v24, v43 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v43, vcc, s6, v43 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_or_b32_e32 v40, v30, v40 -; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v40, vcc, s6, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_or_b32_e32 v44, v24, v44 -; SI-NEXT: v_add_i32_e32 v44, vcc, s6, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v44 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_or_b32_e32 v25, v20, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v35 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v11 -; SI-NEXT: v_mov_b32_e32 v11, v25 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_or_b32_e32 v26, v20, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v26 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v23 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v26 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v19 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v54 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v53 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v29 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v27 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v20 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_or_b32_e32 v31, v20, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v15 +; SI-NEXT: v_add_i32_e32 v31, vcc, s7, v31 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v38 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 -; SI-NEXT: v_mov_b32_e32 v16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v32, v20, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v48, v20, v19 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v19, vcc, s7, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v25 +; SI-NEXT: v_add_i32_e32 v20, vcc, s7, v10 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v9 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v51, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v51 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v48, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v61 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v60 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v0 +; SI-NEXT: v_alignbit_b32 v0, v11, v48, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v12, v6, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v0, v51, v15, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v0, v9, v16, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v31, v3, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v13, v4, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v0, v24, v5, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v0, v21, v22, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v0, v19, v29, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v0, v23, v20, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v0, v30, v27, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 -; SI-NEXT: v_mov_b32_e32 v21, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v0, v28, v34, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 -; SI-NEXT: v_mov_b32_e32 v17, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v0, v33, v36, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 -; SI-NEXT: v_mov_b32_e32 v15, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v0, v35, v38, 16 +; SI-NEXT: v_alignbit_b32 v1, v8, v7, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v0, v37, v39, 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill ; SI-NEXT: .LBB92_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v6, v63 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v45 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v17 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v49 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v52 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v29 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v55 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v18, v18, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v53 +; SI-NEXT: v_or_b32_e32 v19, v19, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v44 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload @@ -171274,217 +167413,22 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v36 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v30 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v32, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v29, v32, v29 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v32, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v34, v30 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v33 -; SI-NEXT: v_or_b32_e32 v30, v32, v30 -; SI-NEXT: v_or_b32_e32 v31, v34, v31 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v128i8_to_v64f16: @@ -175609,1586 +171553,1635 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:328 -; SI-NEXT: ; implicit-def: $vgpr61 : SGPR spill to VGPR lane +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:324 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:320 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:316 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:312 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:308 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:304 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:300 +; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_writelane_b32 v63, s30, 0 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_writelane_b32 v61, s29, 0 -; SI-NEXT: v_writelane_b32 v61, s28, 1 -; SI-NEXT: v_writelane_b32 v61, s27, 2 -; SI-NEXT: v_writelane_b32 v61, s26, 3 -; SI-NEXT: v_writelane_b32 v61, s25, 4 -; SI-NEXT: v_writelane_b32 v61, s24, 5 -; SI-NEXT: v_writelane_b32 v61, s23, 6 -; SI-NEXT: v_writelane_b32 v61, s22, 7 -; SI-NEXT: v_writelane_b32 v61, s21, 8 -; SI-NEXT: v_writelane_b32 v61, s20, 9 -; SI-NEXT: v_writelane_b32 v63, s31, 1 -; SI-NEXT: v_writelane_b32 v63, s34, 2 -; SI-NEXT: v_writelane_b32 v63, s35, 3 -; SI-NEXT: v_writelane_b32 v63, s36, 4 -; SI-NEXT: v_writelane_b32 v63, s37, 5 -; SI-NEXT: v_writelane_b32 v63, s38, 6 -; SI-NEXT: v_writelane_b32 v63, s39, 7 -; SI-NEXT: v_writelane_b32 v63, s48, 8 -; SI-NEXT: v_writelane_b32 v63, s49, 9 -; SI-NEXT: v_writelane_b32 v63, s50, 10 -; SI-NEXT: v_writelane_b32 v63, s51, 11 -; SI-NEXT: v_writelane_b32 v63, s52, 12 -; SI-NEXT: v_writelane_b32 v63, s53, 13 -; SI-NEXT: v_writelane_b32 v63, s54, 14 -; SI-NEXT: v_writelane_b32 v63, s55, 15 -; SI-NEXT: v_writelane_b32 v63, s64, 16 -; SI-NEXT: v_writelane_b32 v63, s65, 17 -; SI-NEXT: v_writelane_b32 v63, s66, 18 -; SI-NEXT: v_writelane_b32 v63, s67, 19 -; SI-NEXT: v_writelane_b32 v63, s68, 20 -; SI-NEXT: v_writelane_b32 v63, s69, 21 -; SI-NEXT: v_writelane_b32 v63, s70, 22 -; SI-NEXT: v_writelane_b32 v63, s71, 23 -; SI-NEXT: v_writelane_b32 v63, s80, 24 -; SI-NEXT: v_writelane_b32 v63, s81, 25 -; SI-NEXT: v_writelane_b32 v63, s82, 26 -; SI-NEXT: v_writelane_b32 v63, s83, 27 -; SI-NEXT: v_writelane_b32 v63, s84, 28 -; SI-NEXT: v_writelane_b32 v63, s85, 29 -; SI-NEXT: v_writelane_b32 v63, s86, 30 -; SI-NEXT: v_writelane_b32 v63, s87, 31 -; SI-NEXT: v_writelane_b32 v63, s96, 32 -; SI-NEXT: v_writelane_b32 v63, s97, 33 -; SI-NEXT: v_writelane_b32 v63, s98, 34 -; SI-NEXT: v_writelane_b32 v63, s99, 35 -; SI-NEXT: s_mov_b32 s61, s19 -; SI-NEXT: s_mov_b32 s62, s17 -; SI-NEXT: s_mov_b32 s73, s18 -; SI-NEXT: s_mov_b32 s10, s16 -; SI-NEXT: v_readfirstlane_b32 s35, v0 -; SI-NEXT: v_readfirstlane_b32 s12, v27 -; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane -; SI-NEXT: v_readfirstlane_b32 s41, v26 +; SI-NEXT: v_writelane_b32 v41, s30, 0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v62, s12, 0 -; SI-NEXT: v_readfirstlane_b32 s46, v29 -; SI-NEXT: v_writelane_b32 v62, s41, 1 -; SI-NEXT: v_readfirstlane_b32 s56, v28 -; SI-NEXT: v_writelane_b32 v62, s46, 2 -; SI-NEXT: v_writelane_b32 v62, s56, 3 -; SI-NEXT: v_readfirstlane_b32 s77, v30 -; SI-NEXT: v_readfirstlane_b32 s96, v3 -; SI-NEXT: v_readfirstlane_b32 s6, v2 -; SI-NEXT: v_readfirstlane_b32 s38, v4 -; SI-NEXT: v_readfirstlane_b32 s94, v7 -; SI-NEXT: v_readfirstlane_b32 s90, v6 -; SI-NEXT: v_readfirstlane_b32 s91, v9 -; SI-NEXT: v_readfirstlane_b32 s98, v8 -; SI-NEXT: v_readfirstlane_b32 s93, v11 -; SI-NEXT: v_readfirstlane_b32 s20, v10 -; SI-NEXT: v_readfirstlane_b32 s24, v13 -; SI-NEXT: v_readfirstlane_b32 s27, v12 -; SI-NEXT: v_readfirstlane_b32 s8, v15 -; SI-NEXT: v_readfirstlane_b32 s9, v14 -; SI-NEXT: v_readfirstlane_b32 s78, v17 -; SI-NEXT: v_readfirstlane_b32 s14, v16 -; SI-NEXT: v_readfirstlane_b32 s40, v19 -; SI-NEXT: v_readfirstlane_b32 s42, v18 -; SI-NEXT: v_readfirstlane_b32 s43, v21 -; SI-NEXT: v_readfirstlane_b32 s44, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:324 -; SI-NEXT: v_writelane_b32 v61, s4, 10 -; SI-NEXT: v_readfirstlane_b32 s88, v23 -; SI-NEXT: v_readfirstlane_b32 s37, v22 -; SI-NEXT: v_readfirstlane_b32 s28, v25 -; SI-NEXT: v_readfirstlane_b32 s7, v24 -; SI-NEXT: v_readfirstlane_b32 s31, v5 -; SI-NEXT: v_readfirstlane_b32 s87, v1 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:320 -; SI-NEXT: v_writelane_b32 v61, s4, 11 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:316 -; SI-NEXT: v_writelane_b32 v61, s4, 12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:312 -; SI-NEXT: v_writelane_b32 v61, s4, 13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:308 -; SI-NEXT: v_writelane_b32 v61, s4, 14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:304 -; SI-NEXT: v_writelane_b32 v61, s4, 15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 -; SI-NEXT: v_writelane_b32 v61, s4, 16 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_writelane_b32 v43, s29, 0 +; SI-NEXT: v_writelane_b32 v43, s28, 1 +; SI-NEXT: v_writelane_b32 v43, s27, 2 +; SI-NEXT: v_writelane_b32 v43, s26, 3 +; SI-NEXT: v_writelane_b32 v43, s25, 4 +; SI-NEXT: v_writelane_b32 v43, s24, 5 +; SI-NEXT: v_writelane_b32 v43, s23, 6 +; SI-NEXT: v_writelane_b32 v43, s22, 7 +; SI-NEXT: v_writelane_b32 v43, s21, 8 +; SI-NEXT: v_writelane_b32 v43, s20, 9 +; SI-NEXT: v_writelane_b32 v43, s19, 10 +; SI-NEXT: v_writelane_b32 v43, s18, 11 +; SI-NEXT: v_writelane_b32 v43, s17, 12 +; SI-NEXT: v_writelane_b32 v41, s31, 1 +; SI-NEXT: v_writelane_b32 v41, s34, 2 +; SI-NEXT: v_writelane_b32 v41, s35, 3 +; SI-NEXT: v_writelane_b32 v41, s36, 4 +; SI-NEXT: v_writelane_b32 v41, s37, 5 +; SI-NEXT: v_writelane_b32 v41, s38, 6 +; SI-NEXT: v_writelane_b32 v41, s39, 7 +; SI-NEXT: v_writelane_b32 v41, s48, 8 +; SI-NEXT: v_writelane_b32 v41, s49, 9 +; SI-NEXT: v_writelane_b32 v41, s50, 10 +; SI-NEXT: v_writelane_b32 v41, s51, 11 +; SI-NEXT: v_writelane_b32 v41, s52, 12 +; SI-NEXT: v_writelane_b32 v41, s53, 13 +; SI-NEXT: v_writelane_b32 v41, s54, 14 +; SI-NEXT: v_writelane_b32 v41, s55, 15 +; SI-NEXT: v_writelane_b32 v41, s64, 16 +; SI-NEXT: v_writelane_b32 v41, s65, 17 +; SI-NEXT: v_writelane_b32 v41, s66, 18 +; SI-NEXT: v_writelane_b32 v41, s67, 19 +; SI-NEXT: v_writelane_b32 v41, s68, 20 +; SI-NEXT: v_writelane_b32 v41, s69, 21 +; SI-NEXT: v_writelane_b32 v41, s70, 22 +; SI-NEXT: v_writelane_b32 v41, s71, 23 +; SI-NEXT: v_writelane_b32 v41, s80, 24 +; SI-NEXT: v_writelane_b32 v41, s81, 25 +; SI-NEXT: v_writelane_b32 v41, s82, 26 +; SI-NEXT: v_writelane_b32 v41, s83, 27 +; SI-NEXT: v_writelane_b32 v41, s84, 28 +; SI-NEXT: v_writelane_b32 v41, s85, 29 +; SI-NEXT: v_writelane_b32 v41, s86, 30 +; SI-NEXT: v_writelane_b32 v41, s87, 31 +; SI-NEXT: v_writelane_b32 v41, s96, 32 +; SI-NEXT: v_writelane_b32 v41, s97, 33 +; SI-NEXT: v_writelane_b32 v41, s98, 34 +; SI-NEXT: s_mov_b32 s22, s16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 +; SI-NEXT: v_readfirstlane_b32 s56, v11 +; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane +; SI-NEXT: v_readfirstlane_b32 s57, v10 +; SI-NEXT: v_writelane_b32 v42, s56, 0 +; SI-NEXT: v_readfirstlane_b32 s49, v23 +; SI-NEXT: v_writelane_b32 v42, s57, 1 +; SI-NEXT: v_readfirstlane_b32 s50, v22 +; SI-NEXT: v_writelane_b32 v42, s49, 2 +; SI-NEXT: v_readfirstlane_b32 s51, v20 +; SI-NEXT: v_writelane_b32 v42, s50, 3 +; SI-NEXT: v_readfirstlane_b32 s52, v21 +; SI-NEXT: v_writelane_b32 v42, s51, 4 +; SI-NEXT: v_writelane_b32 v42, s52, 5 +; SI-NEXT: v_readfirstlane_b32 s58, v19 +; SI-NEXT: v_readfirstlane_b32 s59, v18 +; SI-NEXT: v_readfirstlane_b32 s64, v30 +; SI-NEXT: v_readfirstlane_b32 s65, v28 +; SI-NEXT: v_readfirstlane_b32 s66, v29 +; SI-NEXT: v_readfirstlane_b32 s60, v27 +; SI-NEXT: v_readfirstlane_b32 s61, v26 +; SI-NEXT: v_readfirstlane_b32 s12, v0 +; SI-NEXT: v_readfirstlane_b32 s13, v1 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:296 -; SI-NEXT: v_writelane_b32 v61, s4, 17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:292 -; SI-NEXT: v_writelane_b32 v61, s4, 18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:288 -; SI-NEXT: v_writelane_b32 v61, s4, 19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:284 -; SI-NEXT: v_writelane_b32 v61, s4, 20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:280 -; SI-NEXT: v_writelane_b32 v61, s4, 21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:276 -; SI-NEXT: v_writelane_b32 v61, s4, 22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:272 -; SI-NEXT: v_writelane_b32 v61, s4, 23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:268 -; SI-NEXT: v_writelane_b32 v61, s4, 24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:264 -; SI-NEXT: v_writelane_b32 v61, s4, 25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:260 -; SI-NEXT: v_writelane_b32 v61, s4, 26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:256 -; SI-NEXT: v_writelane_b32 v61, s4, 27 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:292 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:288 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:284 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:280 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:276 +; SI-NEXT: v_writelane_b32 v43, s4, 13 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s4, v32 +; SI-NEXT: v_writelane_b32 v43, s4, 14 +; SI-NEXT: v_readfirstlane_b32 s4, v33 +; SI-NEXT: v_writelane_b32 v43, s4, 15 +; SI-NEXT: v_readfirstlane_b32 s4, v34 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:272 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:268 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164 +; SI-NEXT: v_writelane_b32 v43, s4, 16 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s4, v35 +; SI-NEXT: v_writelane_b32 v43, s4, 17 +; SI-NEXT: v_readfirstlane_b32 s44, v36 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:264 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:260 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:256 +; SI-NEXT: v_readfirstlane_b32 s6, v37 +; SI-NEXT: v_readfirstlane_b32 s7, v38 +; SI-NEXT: v_readfirstlane_b32 s14, v8 +; SI-NEXT: v_readfirstlane_b32 s15, v9 +; SI-NEXT: v_readfirstlane_b32 s40, v7 +; SI-NEXT: v_readfirstlane_b32 s41, v6 +; SI-NEXT: v_readfirstlane_b32 s42, v4 +; SI-NEXT: v_readfirstlane_b32 s43, v5 +; SI-NEXT: v_readfirstlane_b32 s76, v16 +; SI-NEXT: v_readfirstlane_b32 s77, v17 +; SI-NEXT: v_readfirstlane_b32 s46, v3 +; SI-NEXT: v_readfirstlane_b32 s47, v2 +; SI-NEXT: v_readfirstlane_b32 s78, v15 +; SI-NEXT: v_readfirstlane_b32 s38, v13 +; SI-NEXT: v_readfirstlane_b32 s39, v24 +; SI-NEXT: v_writelane_b32 v41, s99, 35 +; SI-NEXT: v_readfirstlane_b32 s48, v25 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s99, v54 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_readfirstlane_b32 s88, v40 +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_writelane_b32 v43, s4, 18 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s4, v39 +; SI-NEXT: v_writelane_b32 v43, s4, 19 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s4, v48 +; SI-NEXT: v_writelane_b32 v43, s4, 20 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s4, v49 +; SI-NEXT: v_writelane_b32 v43, s4, 21 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s4, v50 +; SI-NEXT: v_writelane_b32 v43, s4, 22 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s4, v51 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:252 -; SI-NEXT: v_writelane_b32 v61, s4, 28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:248 -; SI-NEXT: v_writelane_b32 v61, s4, 29 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:244 -; SI-NEXT: v_writelane_b32 v61, s4, 30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:240 -; SI-NEXT: v_writelane_b32 v61, s4, 31 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:236 -; SI-NEXT: v_writelane_b32 v61, s4, 32 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:232 -; SI-NEXT: v_writelane_b32 v61, s4, 33 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:228 -; SI-NEXT: v_writelane_b32 v61, s4, 34 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:224 -; SI-NEXT: v_writelane_b32 v61, s4, 35 -; SI-NEXT: v_writelane_b32 v61, s62, 36 -; SI-NEXT: v_writelane_b32 v61, s10, 37 -; SI-NEXT: v_writelane_b32 v61, s61, 38 -; SI-NEXT: v_writelane_b32 v61, s73, 39 -; SI-NEXT: v_writelane_b32 v61, s35, 40 -; SI-NEXT: v_writelane_b32 v61, s96, 41 -; SI-NEXT: v_writelane_b32 v61, s6, 42 -; SI-NEXT: v_writelane_b32 v61, s38, 43 -; SI-NEXT: v_writelane_b32 v61, s94, 44 -; SI-NEXT: v_writelane_b32 v61, s90, 45 -; SI-NEXT: v_writelane_b32 v61, s91, 46 -; SI-NEXT: v_writelane_b32 v61, s98, 47 -; SI-NEXT: v_writelane_b32 v61, s93, 48 -; SI-NEXT: v_writelane_b32 v61, s20, 49 -; SI-NEXT: v_writelane_b32 v61, s24, 50 -; SI-NEXT: v_writelane_b32 v61, s27, 51 -; SI-NEXT: v_writelane_b32 v61, s8, 52 -; SI-NEXT: v_writelane_b32 v61, s9, 53 -; SI-NEXT: v_writelane_b32 v61, s78, 54 -; SI-NEXT: v_writelane_b32 v61, s14, 55 -; SI-NEXT: v_writelane_b32 v61, s40, 56 -; SI-NEXT: v_writelane_b32 v61, s42, 57 -; SI-NEXT: v_writelane_b32 v61, s43, 58 -; SI-NEXT: v_writelane_b32 v61, s44, 59 -; SI-NEXT: v_writelane_b32 v61, s88, 60 -; SI-NEXT: v_writelane_b32 v61, s37, 61 -; SI-NEXT: v_writelane_b32 v61, s28, 62 -; SI-NEXT: v_writelane_b32 v61, s7, 63 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s99, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:220 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s95, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:216 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s68, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s89, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:208 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s76, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:204 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s36, v31 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:248 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:240 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:236 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:232 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:228 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:224 +; SI-NEXT: v_writelane_b32 v43, s4, 23 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s8, v32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s9, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:216 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s4, v34 +; SI-NEXT: v_writelane_b32 v43, s4, 24 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s4, v35 +; SI-NEXT: v_writelane_b32 v43, s4, 25 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s4, v36 +; SI-NEXT: v_writelane_b32 v43, s4, 26 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:212 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:208 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:204 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s91, v31 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s4, v37 +; SI-NEXT: v_writelane_b32 v43, s4, 27 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s4, v38 +; SI-NEXT: v_writelane_b32 v43, s4, 28 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s93, v39 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s4, v50 +; SI-NEXT: v_readfirstlane_b32 s80, v48 +; SI-NEXT: v_readfirstlane_b32 s82, v49 +; SI-NEXT: v_writelane_b32 v43, s4, 29 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_readfirstlane_b32 s4, v51 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s48, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s53, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:192 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s45, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:188 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s81, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s66, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s19, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:176 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s69, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s97, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:168 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s25, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:164 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s85, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:160 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s26, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s13, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s11, v31 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:196 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:192 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:188 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:184 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:180 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:172 +; SI-NEXT: v_writelane_b32 v43, s4, 30 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s79, v32 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s83, v33 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s36, v34 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s10, v35 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s11, v36 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:112 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s57, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s47, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:140 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s92, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s59, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s34, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s30, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s50, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s39, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s54, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s15, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s17, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s18, v31 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:140 +; SI-NEXT: v_writelane_b32 v43, s4, 31 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s4, v37 +; SI-NEXT: v_writelane_b32 v43, s4, 32 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s4, v38 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s98, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s90, v48 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s89, v49 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s95, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s81, v51 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:332 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 +; SI-NEXT: v_writelane_b32 v43, s4, 33 +; SI-NEXT: v_readfirstlane_b32 s4, v52 +; SI-NEXT: v_writelane_b32 v43, s4, 34 +; SI-NEXT: v_readfirstlane_b32 s4, v53 +; SI-NEXT: v_writelane_b32 v43, s4, 35 +; SI-NEXT: v_readfirstlane_b32 s4, v55 +; SI-NEXT: v_writelane_b32 v43, s4, 36 +; SI-NEXT: v_writelane_b32 v43, s44, 37 +; SI-NEXT: v_writelane_b32 v43, s6, 38 +; SI-NEXT: v_writelane_b32 v43, s83, 39 +; SI-NEXT: v_writelane_b32 v43, s7, 40 +; SI-NEXT: v_writelane_b32 v43, s8, 41 +; SI-NEXT: v_writelane_b32 v43, s36, 42 +; SI-NEXT: v_writelane_b32 v43, s9, 43 +; SI-NEXT: v_writelane_b32 v43, s10, 44 +; SI-NEXT: v_writelane_b32 v43, s11, 45 +; SI-NEXT: v_writelane_b32 v43, s12, 46 +; SI-NEXT: v_writelane_b32 v43, s13, 47 +; SI-NEXT: v_writelane_b32 v43, s14, 48 +; SI-NEXT: v_writelane_b32 v43, s15, 49 +; SI-NEXT: v_writelane_b32 v43, s40, 50 +; SI-NEXT: v_writelane_b32 v43, s41, 51 +; SI-NEXT: v_writelane_b32 v43, s42, 52 +; SI-NEXT: v_writelane_b32 v43, s43, 53 +; SI-NEXT: v_writelane_b32 v43, s76, 54 +; SI-NEXT: v_writelane_b32 v43, s77, 55 +; SI-NEXT: v_writelane_b32 v43, s46, 56 +; SI-NEXT: v_writelane_b32 v43, s47, 57 +; SI-NEXT: v_writelane_b32 v43, s78, 58 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s97, v31 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s28, v32 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s29, v33 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s92, v35 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s84, v36 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s17, v37 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s64, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s52, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s49, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s65, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s67, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s71, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s80, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s70, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:80 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s94, v38 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s21, v39 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s24, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v49 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s16, v50 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s34, v51 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:52 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 vcc_lo, v14 +; SI-NEXT: v_readfirstlane_b32 vcc_hi, v12 +; SI-NEXT: v_writelane_b32 v43, vcc_lo, 59 +; SI-NEXT: v_writelane_b32 v43, vcc_hi, 60 +; SI-NEXT: v_writelane_b32 v43, s38, 61 +; SI-NEXT: v_writelane_b32 v43, s39, 62 +; SI-NEXT: v_writelane_b32 v43, s48, 63 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s86, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s84, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s63, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s83, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s74, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 +; SI-NEXT: v_readfirstlane_b32 s25, v34 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s23, v32 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s96, v33 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s35, v35 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s31, v36 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s72, v37 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s20, v49 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s18, v50 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_readfirstlane_b32 s19, v51 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_readfirstlane_b32 s75, v34 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_readfirstlane_b32 s67, v38 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_readfirstlane_b32 s71, v39 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s51, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s22, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s55, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s60, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s82, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s72, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s75, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s23, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s79, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s21, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s16, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s29, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: v_readfirstlane_b32 s30, v48 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:16 +; SI-NEXT: v_readfirstlane_b32 s73, v31 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s74, v32 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s70, v33 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_readfirstlane_b32 s69, v34 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_readfirstlane_b32 s68, v35 +; SI-NEXT: v_readfirstlane_b32 s54, v49 +; SI-NEXT: v_readfirstlane_b32 s53, v50 +; SI-NEXT: v_writelane_b32 v42, s53, 6 +; SI-NEXT: v_writelane_b32 v42, s54, 7 +; SI-NEXT: v_writelane_b32 v42, s58, 8 +; SI-NEXT: v_readfirstlane_b32 s55, v51 +; SI-NEXT: v_writelane_b32 v42, s59, 9 +; SI-NEXT: v_writelane_b32 v42, s55, 10 +; SI-NEXT: v_writelane_b32 v42, s64, 11 +; SI-NEXT: v_writelane_b32 v42, s65, 12 +; SI-NEXT: v_writelane_b32 v42, s66, 13 +; SI-NEXT: v_writelane_b32 v42, s67, 14 +; SI-NEXT: v_writelane_b32 v42, s69, 15 +; SI-NEXT: v_writelane_b32 v42, s70, 16 +; SI-NEXT: v_writelane_b32 v42, s71, 17 +; SI-NEXT: v_writelane_b32 v42, s60, 18 +; SI-NEXT: v_writelane_b32 v42, s61, 19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_readfirstlane_b32 s85, v36 +; SI-NEXT: v_writelane_b32 v42, s68, 20 +; SI-NEXT: v_writelane_b32 v42, s85, 21 +; SI-NEXT: v_writelane_b32 v42, s30, 22 +; SI-NEXT: v_writelane_b32 v42, s34, 23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_readfirstlane_b32 s37, v38 +; SI-NEXT: v_writelane_b32 v42, s86, 24 +; SI-NEXT: v_readfirstlane_b32 s87, v37 +; SI-NEXT: v_writelane_b32 v42, s37, 25 +; SI-NEXT: v_writelane_b32 v42, s87, 26 +; SI-NEXT: v_writelane_b32 v42, s20, 27 +; SI-NEXT: v_writelane_b32 v42, s84, 28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s58, v31 -; SI-NEXT: v_writelane_b32 v62, s58, 4 -; SI-NEXT: v_writelane_b32 v62, s16, 5 -; SI-NEXT: v_writelane_b32 v62, s77, 6 -; SI-NEXT: v_writelane_b32 v62, s79, 7 -; SI-NEXT: v_writelane_b32 v62, s29, 8 -; SI-NEXT: v_writelane_b32 v62, s75, 9 -; SI-NEXT: v_writelane_b32 v62, s21, 10 -; SI-NEXT: v_writelane_b32 v62, s23, 11 -; SI-NEXT: v_writelane_b32 v62, s17, 12 -; SI-NEXT: v_writelane_b32 v62, s18, 13 -; SI-NEXT: v_writelane_b32 v62, s52, 14 -; SI-NEXT: v_writelane_b32 v62, s65, 15 -; SI-NEXT: v_writelane_b32 v62, s64, 16 -; SI-NEXT: v_writelane_b32 v62, s49, 17 -; SI-NEXT: v_writelane_b32 v62, s67, 18 -; SI-NEXT: v_writelane_b32 v62, s71, 19 -; SI-NEXT: v_writelane_b32 v62, s70, 20 -; SI-NEXT: v_writelane_b32 v62, s84, 21 -; SI-NEXT: v_writelane_b32 v62, s80, 22 -; SI-NEXT: v_writelane_b32 v62, s83, 23 -; SI-NEXT: v_writelane_b32 v62, s51, 24 -; SI-NEXT: v_writelane_b32 v62, s82, 25 -; SI-NEXT: v_writelane_b32 v62, s55, 26 -; SI-NEXT: v_writelane_b32 v62, s86, 27 -; SI-NEXT: v_writelane_b32 v62, s63, 28 -; SI-NEXT: v_writelane_b32 v62, s74, 29 -; SI-NEXT: v_writelane_b32 v62, s72, 30 -; SI-NEXT: v_writelane_b32 v62, s22, 31 -; SI-NEXT: v_writelane_b32 v62, s60, 32 +; SI-NEXT: v_readfirstlane_b32 s62, v39 +; SI-NEXT: v_writelane_b32 v42, s92, 29 +; SI-NEXT: v_writelane_b32 v42, s62, 30 +; SI-NEXT: v_readfirstlane_b32 s63, v48 +; SI-NEXT: v_writelane_b32 v42, s23, 31 +; SI-NEXT: v_writelane_b32 v42, s63, 32 +; SI-NEXT: v_writelane_b32 v42, s96, 33 +; SI-NEXT: v_writelane_b32 v42, s17, 34 +; SI-NEXT: v_writelane_b32 v42, s18, 35 +; SI-NEXT: v_writelane_b32 v42, s94, 36 +; SI-NEXT: v_writelane_b32 v42, s19, 37 +; SI-NEXT: v_writelane_b32 v42, s31, 38 +; SI-NEXT: v_writelane_b32 v42, s35, 39 +; SI-NEXT: v_writelane_b32 v42, s24, 40 +; SI-NEXT: v_writelane_b32 v42, s21, 41 +; SI-NEXT: v_writelane_b32 v42, s72, 42 +; SI-NEXT: v_writelane_b32 v42, s73, 43 +; SI-NEXT: v_writelane_b32 v42, s74, 44 +; SI-NEXT: v_writelane_b32 v42, s75, 45 +; SI-NEXT: v_writelane_b32 v42, s25, 46 +; SI-NEXT: v_writelane_b32 v42, s16, 47 +; SI-NEXT: v_writelane_b32 v42, s97, 48 +; SI-NEXT: v_writelane_b32 v42, s28, 49 +; SI-NEXT: v_writelane_b32 v42, s29, 50 ; SI-NEXT: s_cbranch_scc0 .LBB93_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_and_b32 s4, s10, 0xff -; SI-NEXT: s_lshl_b32 s5, s62, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_and_b32 s4, s73, 0xff -; SI-NEXT: s_lshl_b32 s5, s61, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 9 -; SI-NEXT: v_readlane_b32 s5, v61, 8 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 7 -; SI-NEXT: v_readlane_b32 s5, v61, 6 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 5 -; SI-NEXT: v_readlane_b32 s5, v61, 4 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 3 -; SI-NEXT: v_readlane_b32 s5, v61, 2 -; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: v_readlane_b32 s5, v43, 12 +; SI-NEXT: s_and_b32 s4, s22, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 1 -; SI-NEXT: v_readlane_b32 s5, v61, 0 +; SI-NEXT: v_writelane_b32 v42, s4, 51 +; SI-NEXT: v_readlane_b32 s4, v43, 5 +; SI-NEXT: v_readlane_b32 s5, v43, 4 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_and_b32 s4, s35, 0xff -; SI-NEXT: s_lshl_b32 s5, s87, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_and_b32 s4, s6, 0xff -; SI-NEXT: s_lshl_b32 s5, s96, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_and_b32 s4, s38, 0xff -; SI-NEXT: s_lshl_b32 s5, s31, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_and_b32 s4, s90, 0xff -; SI-NEXT: s_lshl_b32 s5, s94, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_and_b32 s4, s98, 0xff -; SI-NEXT: s_lshl_b32 s5, s91, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: s_lshl_b32 s5, s93, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_and_b32 s4, s27, 0xff -; SI-NEXT: s_lshl_b32 s5, s24, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_and_b32 s4, s9, 0xff -; SI-NEXT: s_lshl_b32 s5, s8, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_and_b32 s4, s14, 0xff -; SI-NEXT: s_lshl_b32 s5, s78, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_and_b32 s4, s42, 0xff -; SI-NEXT: s_lshl_b32 s5, s40, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_and_b32 s4, s44, 0xff -; SI-NEXT: s_lshl_b32 s5, s43, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_and_b32 s4, s37, 0xff -; SI-NEXT: s_lshl_b32 s5, s88, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_and_b32 s4, s7, 0xff -; SI-NEXT: s_lshl_b32 s5, s28, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_and_b32 s4, s41, 0xff -; SI-NEXT: s_lshl_b32 s5, s12, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_and_b32 s4, s56, 0xff +; SI-NEXT: s_or_b32 s45, s4, s5 +; SI-NEXT: s_and_b32 s4, s47, 0xff ; SI-NEXT: s_lshl_b32 s5, s46, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_and_b32 s4, s77, 0xff +; SI-NEXT: s_or_b32 s46, s4, s5 +; SI-NEXT: s_and_b32 s4, s57, 0xff +; SI-NEXT: s_lshl_b32 s5, s56, 8 +; SI-NEXT: s_or_b32 s47, s4, s5 +; SI-NEXT: s_and_b32 s4, s59, 0xff ; SI-NEXT: s_lshl_b32 s5, s58, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_and_b32 s4, s29, 0xff -; SI-NEXT: s_lshl_b32 s5, s16, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 -; SI-NEXT: s_and_b32 s4, s21, 0xff -; SI-NEXT: s_lshl_b32 s5, s79, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_and_b32 s4, s23, 0xff -; SI-NEXT: s_lshl_b32 s5, s75, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_and_b32 s4, s72, 0xff -; SI-NEXT: s_lshl_b32 s5, s82, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 -; SI-NEXT: s_and_b32 s4, s60, 0xff -; SI-NEXT: s_lshl_b32 s5, s55, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 -; SI-NEXT: s_and_b32 s4, s22, 0xff -; SI-NEXT: s_lshl_b32 s5, s51, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: s_and_b32 s4, s74, 0xff -; SI-NEXT: s_lshl_b32 s5, s83, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: s_or_b32 s56, s4, s5 +; SI-NEXT: s_and_b32 s4, s61, 0xff +; SI-NEXT: s_lshl_b32 s5, s60, 8 +; SI-NEXT: s_or_b32 s57, s4, s5 ; SI-NEXT: s_and_b32 s4, s63, 0xff -; SI-NEXT: s_lshl_b32 s5, s84, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_and_b32 s4, s86, 0xff -; SI-NEXT: s_lshl_b32 s5, s70, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_and_b32 s4, s80, 0xff -; SI-NEXT: s_lshl_b32 s5, s71, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_and_b32 s4, s67, 0xff -; SI-NEXT: s_lshl_b32 s5, s65, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 -; SI-NEXT: s_and_b32 s4, s49, 0xff -; SI-NEXT: s_lshl_b32 s5, s52, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 -; SI-NEXT: s_and_b32 s4, s64, 0xff -; SI-NEXT: s_lshl_b32 s5, s18, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 -; SI-NEXT: s_and_b32 s4, s17, 0xff -; SI-NEXT: s_lshl_b32 s5, s15, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 -; SI-NEXT: s_and_b32 s4, s54, 0xff -; SI-NEXT: s_lshl_b32 s5, s39, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 -; SI-NEXT: s_and_b32 s4, s50, 0xff -; SI-NEXT: s_lshl_b32 s5, s30, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 -; SI-NEXT: s_and_b32 s4, s34, 0xff -; SI-NEXT: s_lshl_b32 s5, s59, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 -; SI-NEXT: s_and_b32 s4, s92, 0xff -; SI-NEXT: s_lshl_b32 s5, s47, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 -; SI-NEXT: s_and_b32 s4, s57, 0xff -; SI-NEXT: s_lshl_b32 s5, s11, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 -; SI-NEXT: s_and_b32 s4, s13, 0xff -; SI-NEXT: s_lshl_b32 s5, s26, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 -; SI-NEXT: s_and_b32 s4, s85, 0xff +; SI-NEXT: s_lshl_b32 s5, s62, 8 +; SI-NEXT: s_or_b32 s58, s4, s5 +; SI-NEXT: s_and_b32 s4, s74, 0xff +; SI-NEXT: s_lshl_b32 s5, s73, 8 +; SI-NEXT: s_or_b32 s59, s4, s5 +; SI-NEXT: s_and_b32 s4, s75, 0xff +; SI-NEXT: s_lshl_b32 s5, s72, 8 +; SI-NEXT: s_or_b32 s60, s4, s5 +; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s25, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 -; SI-NEXT: s_and_b32 s4, s97, 0xff -; SI-NEXT: s_lshl_b32 s5, s69, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 -; SI-NEXT: s_and_b32 s4, s19, 0xff -; SI-NEXT: s_lshl_b32 s5, s66, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 +; SI-NEXT: s_or_b32 s61, s4, s5 +; SI-NEXT: s_and_b32 s4, s29, 0xff +; SI-NEXT: s_lshl_b32 s5, s28, 8 +; SI-NEXT: s_or_b32 s62, s4, s5 ; SI-NEXT: s_and_b32 s4, s81, 0xff -; SI-NEXT: s_lshl_b32 s5, s45, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 -; SI-NEXT: s_and_b32 s4, s53, 0xff -; SI-NEXT: s_lshl_b32 s5, s48, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 -; SI-NEXT: s_and_b32 s4, s36, 0xff -; SI-NEXT: s_lshl_b32 s5, s76, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 -; SI-NEXT: s_and_b32 s4, s89, 0xff -; SI-NEXT: s_lshl_b32 s5, s68, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 -; SI-NEXT: s_and_b32 s4, s95, 0xff -; SI-NEXT: s_lshl_b32 s5, s99, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 35 -; SI-NEXT: v_readlane_b32 s5, v61, 34 -; SI-NEXT: s_mov_b32 s6, s99 -; SI-NEXT: s_mov_b32 s99, s4 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_mov_b32 s96, s5 -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 33 -; SI-NEXT: v_readlane_b32 s5, v61, 32 -; SI-NEXT: s_mov_b32 s55, s4 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_mov_b32 s86, s5 -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 31 -; SI-NEXT: v_readlane_b32 s5, v61, 30 -; SI-NEXT: s_mov_b32 s35, s87 -; SI-NEXT: s_mov_b32 s82, s4 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_mov_b32 s87, s5 -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 29 -; SI-NEXT: v_readlane_b32 s5, v61, 28 -; SI-NEXT: s_mov_b32 s83, s4 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_mov_b32 s51, s5 -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 27 -; SI-NEXT: v_readlane_b32 s5, v61, 26 -; SI-NEXT: s_mov_b32 s84, s4 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_mov_b32 s80, s5 -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 25 -; SI-NEXT: v_readlane_b32 s5, v61, 24 -; SI-NEXT: s_mov_b32 s71, s4 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_mov_b32 s70, s5 -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 23 -; SI-NEXT: v_readlane_b32 s5, v61, 22 -; SI-NEXT: s_mov_b32 s49, s4 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_mov_b32 s67, s5 -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 21 -; SI-NEXT: v_readlane_b32 s5, v61, 20 -; SI-NEXT: s_mov_b32 s65, s4 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_mov_b32 s64, s5 -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 19 -; SI-NEXT: v_readlane_b32 s5, v61, 18 -; SI-NEXT: s_mov_b32 s17, s15 -; SI-NEXT: s_mov_b32 s18, s54 -; SI-NEXT: s_mov_b32 s15, s50 -; SI-NEXT: s_mov_b32 s54, s4 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_mov_b32 s50, s5 -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 17 -; SI-NEXT: v_readlane_b32 s5, v61, 16 -; SI-NEXT: s_mov_b32 s23, s34 -; SI-NEXT: s_mov_b32 s14, s48 -; SI-NEXT: s_mov_b32 s34, s4 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_mov_b32 s48, s5 -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 15 -; SI-NEXT: v_readlane_b32 s5, v61, 14 -; SI-NEXT: s_mov_b32 s52, s4 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_mov_b32 s75, s5 -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 13 -; SI-NEXT: v_readlane_b32 s5, v61, 12 -; SI-NEXT: s_mov_b32 s29, s30 -; SI-NEXT: s_mov_b32 s79, s92 -; SI-NEXT: s_mov_b32 s30, s4 +; SI-NEXT: s_lshl_b32 s5, s95, 8 +; SI-NEXT: s_or_b32 s63, s4, s5 +; SI-NEXT: s_and_b32 s4, s11, 0xff +; SI-NEXT: s_lshl_b32 s5, s10, 8 +; SI-NEXT: s_or_b32 s72, s4, s5 +; SI-NEXT: s_and_b32 s4, s80, 0xff +; SI-NEXT: s_lshl_b32 s5, s93, 8 +; SI-NEXT: s_or_b32 s73, s4, s5 +; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: s_lshl_b32 s5, s8, 8 +; SI-NEXT: s_or_b32 s74, s4, s5 +; SI-NEXT: s_and_b32 s4, s7, 0xff +; SI-NEXT: s_lshl_b32 s5, s6, 8 +; SI-NEXT: s_or_b32 s75, s4, s5 +; SI-NEXT: v_readlane_b32 s4, v43, 9 +; SI-NEXT: v_readlane_b32 s5, v43, 8 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_mov_b32 s92, s5 ; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 11 -; SI-NEXT: v_readlane_b32 s5, v61, 10 -; SI-NEXT: s_mov_b32 s21, s39 -; SI-NEXT: s_mov_b32 s39, s4 +; SI-NEXT: s_or_b32 s5, s4, s5 +; SI-NEXT: v_readlane_b32 s4, v43, 7 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_mov_b32 s77, s5 -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_mov_b32 s38, s31 -; SI-NEXT: s_mov_b32 s16, s59 -; SI-NEXT: s_mov_b32 s56, s47 -; SI-NEXT: s_mov_b32 s58, s57 -; SI-NEXT: s_mov_b32 s12, s11 -; SI-NEXT: s_mov_b32 s41, s13 -; SI-NEXT: s_mov_b32 s28, s26 -; SI-NEXT: s_mov_b32 s7, s85 -; SI-NEXT: s_mov_b32 s26, s25 -; SI-NEXT: s_mov_b32 s85, s97 -; SI-NEXT: s_mov_b32 s25, s69 -; SI-NEXT: s_mov_b32 s97, s19 -; SI-NEXT: s_mov_b32 s37, s66 -; SI-NEXT: s_mov_b32 s69, s81 -; SI-NEXT: s_mov_b32 s44, s45 -; SI-NEXT: s_mov_b32 s66, s53 -; SI-NEXT: s_mov_b32 s53, s36 -; SI-NEXT: s_mov_b32 s98, s76 -; SI-NEXT: s_mov_b32 s36, s89 -; SI-NEXT: s_mov_b32 s90, s68 -; SI-NEXT: s_mov_b32 s89, s95 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 -; SI-NEXT: s_cbranch_execnz .LBB93_3 -; SI-NEXT: .LBB93_2: ; %cmp.true -; SI-NEXT: s_add_i32 s4, s39, 3 +; SI-NEXT: v_readlane_b32 s6, v43, 6 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s6, s6, 24 +; SI-NEXT: s_or_b32 s7, s6, s4 +; SI-NEXT: v_readlane_b32 s4, v43, 11 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s5, s77, 8 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s5, s30, 3 -; SI-NEXT: s_and_b32 s5, s5, 0xff -; SI-NEXT: s_lshl_b32 vcc_lo, s92, 8 -; SI-NEXT: s_or_b32 s5, vcc_lo, s5 -; SI-NEXT: s_add_i32 vcc_lo, s52, 3 -; SI-NEXT: s_and_b32 vcc_lo, vcc_lo, 0xff -; SI-NEXT: s_lshl_b32 vcc_hi, s75, 8 -; SI-NEXT: s_or_b32 vcc_lo, vcc_hi, vcc_lo -; SI-NEXT: s_add_i32 vcc_hi, s34, 3 -; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s60, s48, 8 -; SI-NEXT: s_or_b32 s60, s60, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s54, 3 -; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s61, s50, 8 -; SI-NEXT: s_or_b32 s61, s61, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s65, 3 -; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s62, s64, 8 -; SI-NEXT: s_or_b32 s62, s62, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s49, 3 -; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s63, s67, 8 -; SI-NEXT: s_or_b32 s10, s63, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s71, 3 -; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s72, s70, 8 -; SI-NEXT: s_or_b32 s72, s72, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s84, 3 -; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s73, s80, 8 -; SI-NEXT: s_or_b32 s73, s73, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s83, 3 -; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s74, s51, 8 -; SI-NEXT: s_or_b32 s74, s74, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s82, 3 -; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s75, s87, 8 -; SI-NEXT: s_or_b32 s75, s75, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s55, 3 -; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s76, s86, 8 -; SI-NEXT: s_or_b32 s76, s76, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s99, 3 -; SI-NEXT: s_add_i32 s95, s36, 3 -; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s77, s96, 8 -; SI-NEXT: s_add_i32 s89, s89, 3 -; SI-NEXT: s_and_b32 s95, s95, 0xff -; SI-NEXT: s_lshl_b32 s88, s90, 8 -; SI-NEXT: s_add_i32 s36, s53, 3 -; SI-NEXT: s_or_b32 s77, s77, vcc_hi -; SI-NEXT: s_and_b32 s89, s89, 0xff -; SI-NEXT: s_lshl_b32 vcc_hi, s6, 8 -; SI-NEXT: s_or_b32 s22, s88, s95 -; SI-NEXT: s_and_b32 s95, s36, 0xff -; SI-NEXT: s_lshl_b32 s92, s98, 8 -; SI-NEXT: s_add_i32 s53, s66, 3 -; SI-NEXT: s_or_b32 s89, vcc_hi, s89 -; SI-NEXT: s_or_b32 s92, s92, s95 -; SI-NEXT: s_and_b32 s95, s53, 0xff -; SI-NEXT: s_lshl_b32 vcc_hi, s14, 8 -; SI-NEXT: s_add_i32 s66, s69, 3 -; SI-NEXT: s_or_b32 s95, vcc_hi, s95 -; SI-NEXT: s_and_b32 vcc_hi, s66, 0xff -; SI-NEXT: s_lshl_b32 s36, s44, 8 -; SI-NEXT: s_add_i32 s68, s97, 3 -; SI-NEXT: s_or_b32 vcc_hi, s36, vcc_hi -; SI-NEXT: s_and_b32 s36, s68, 0xff -; SI-NEXT: s_lshl_b32 s39, s37, 8 -; SI-NEXT: s_add_i32 s69, s85, 3 -; SI-NEXT: s_or_b32 s36, s39, s36 -; SI-NEXT: s_and_b32 s39, s69, 0xff -; SI-NEXT: s_lshl_b32 s52, s25, 8 -; SI-NEXT: s_add_i32 s81, s7, 3 -; SI-NEXT: s_or_b32 s39, s52, s39 -; SI-NEXT: s_and_b32 s52, s81, 0xff -; SI-NEXT: s_lshl_b32 s53, s26, 8 -; SI-NEXT: s_add_i32 s85, s41, 3 -; SI-NEXT: s_or_b32 s52, s53, s52 -; SI-NEXT: s_and_b32 s53, s85, 0xff -; SI-NEXT: s_lshl_b32 s64, s28, 8 -; SI-NEXT: s_add_i32 s97, s58, 3 -; SI-NEXT: s_or_b32 s53, s64, s53 -; SI-NEXT: s_and_b32 s64, s97, 0xff -; SI-NEXT: s_lshl_b32 s66, s12, 8 -; SI-NEXT: s_add_i32 s19, s79, 3 -; SI-NEXT: s_or_b32 s64, s66, s64 -; SI-NEXT: s_and_b32 s19, s19, 0xff -; SI-NEXT: s_lshl_b32 s66, s56, 8 -; SI-NEXT: s_add_i32 s25, s23, 3 -; SI-NEXT: s_or_b32 s66, s66, s19 -; SI-NEXT: s_and_b32 s19, s25, 0xff -; SI-NEXT: s_lshl_b32 s6, s16, 8 -; SI-NEXT: s_add_i32 s26, s15, 3 -; SI-NEXT: s_or_b32 s67, s6, s19 -; SI-NEXT: s_and_b32 s6, s26, 0xff -; SI-NEXT: s_lshl_b32 s19, s29, 8 -; SI-NEXT: s_add_i32 s28, s18, 3 -; SI-NEXT: s_or_b32 s68, s19, s6 -; SI-NEXT: s_and_b32 s6, s28, 0xff -; SI-NEXT: s_lshl_b32 s19, s21, 8 -; SI-NEXT: s_or_b32 s69, s19, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 12 -; SI-NEXT: v_readlane_b32 s16, v62, 11 -; SI-NEXT: s_add_i32 s7, s6, 3 -; SI-NEXT: s_add_i32 s27, s16, 3 -; SI-NEXT: v_readlane_b32 s16, v62, 9 -; SI-NEXT: s_and_b32 s6, s7, 0xff -; SI-NEXT: s_lshl_b32 s7, s17, 8 -; SI-NEXT: s_lshl_b32 s23, s16, 8 -; SI-NEXT: v_readlane_b32 s16, v62, 10 -; SI-NEXT: s_or_b32 s70, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 16 -; SI-NEXT: s_add_i32 s24, s16, 3 -; SI-NEXT: v_readlane_b32 s16, v62, 7 -; SI-NEXT: s_add_i32 s11, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 13 -; SI-NEXT: s_lshl_b32 s18, s16, 8 -; SI-NEXT: v_readlane_b32 s16, v62, 8 -; SI-NEXT: s_and_b32 s6, s11, 0xff +; SI-NEXT: v_readlane_b32 s6, v43, 10 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s6, s6, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: v_writelane_b32 v42, s7, 52 +; SI-NEXT: s_or_b32 s4, s6, s4 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: v_readlane_b32 s6, v43, 1 +; SI-NEXT: v_readlane_b32 s7, v43, 0 +; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_add_i32 s20, s16, 3 -; SI-NEXT: v_readlane_b32 s16, v62, 5 -; SI-NEXT: s_or_b32 s71, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 17 -; SI-NEXT: s_and_b32 s20, s20, 0xff -; SI-NEXT: s_lshl_b32 s17, s16, 8 -; SI-NEXT: v_readlane_b32 s16, v62, 6 -; SI-NEXT: s_add_i32 s12, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 14 -; SI-NEXT: s_or_b32 s17, s17, s20 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s20, v62, 4 +; SI-NEXT: s_or_b32 s7, s6, s7 ; SI-NEXT: s_and_b32 s6, s12, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_lshl_b32 s20, s20, 8 -; SI-NEXT: s_or_b32 s81, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 18 -; SI-NEXT: s_and_b32 s24, s24, 0xff -; SI-NEXT: s_or_b32 s16, s20, s16 -; SI-NEXT: v_readlane_b32 s20, v62, 3 -; SI-NEXT: s_add_i32 s13, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 15 -; SI-NEXT: s_or_b32 s18, s18, s24 -; SI-NEXT: s_add_i32 s98, s20, 3 -; SI-NEXT: v_readlane_b32 s24, v62, 2 -; SI-NEXT: s_and_b32 s6, s13, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_and_b32 s20, s98, 0xff -; SI-NEXT: s_lshl_b32 s24, s24, 8 -; SI-NEXT: s_or_b32 s83, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 22 -; SI-NEXT: s_and_b32 s27, s27, 0xff -; SI-NEXT: s_or_b32 s20, s24, s20 -; SI-NEXT: v_readlane_b32 s24, v62, 1 -; SI-NEXT: s_add_i32 s41, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 19 -; SI-NEXT: s_or_b32 s23, s23, s27 -; SI-NEXT: s_add_i32 s86, s24, 3 -; SI-NEXT: v_readlane_b32 s27, v62, 0 -; SI-NEXT: s_and_b32 s6, s41, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_and_b32 s24, s86, 0xff -; SI-NEXT: s_lshl_b32 s27, s27, 8 -; SI-NEXT: s_or_b32 s85, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 27 -; SI-NEXT: s_or_b32 s24, s27, s24 -; SI-NEXT: v_readlane_b32 s27, v61, 63 -; SI-NEXT: s_add_i32 s46, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 20 -; SI-NEXT: s_add_i32 s11, s72, 0x300 -; SI-NEXT: s_add_i32 s82, s27, 3 -; SI-NEXT: v_readlane_b32 s72, v61, 62 -; SI-NEXT: s_and_b32 s6, s46, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_and_b32 s27, s82, 0xff -; SI-NEXT: s_lshl_b32 s72, s72, 8 -; SI-NEXT: s_or_b32 s96, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 28 -; SI-NEXT: s_or_b32 s27, s72, s27 -; SI-NEXT: v_readlane_b32 s72, v61, 61 -; SI-NEXT: s_add_i32 s47, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 21 -; SI-NEXT: s_add_i32 s12, s73, 0x300 -; SI-NEXT: s_add_i32 s65, s72, 3 -; SI-NEXT: v_readlane_b32 s73, v61, 60 -; SI-NEXT: s_and_b32 s6, s47, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_and_b32 s72, s65, 0xff -; SI-NEXT: s_lshl_b32 s73, s73, 8 -; SI-NEXT: s_or_b32 s97, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 29 -; SI-NEXT: s_or_b32 s72, s73, s72 -; SI-NEXT: v_readlane_b32 s73, v61, 59 -; SI-NEXT: s_add_i32 s56, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 23 -; SI-NEXT: s_add_i32 s13, s74, 0x300 -; SI-NEXT: s_add_i32 s54, s73, 3 -; SI-NEXT: v_readlane_b32 s74, v61, 58 -; SI-NEXT: s_and_b32 s6, s56, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_and_b32 s73, s54, 0xff -; SI-NEXT: s_lshl_b32 s74, s74, 8 -; SI-NEXT: s_or_b32 s63, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 31 -; SI-NEXT: s_or_b32 s73, s74, s73 -; SI-NEXT: v_readlane_b32 s74, v61, 57 -; SI-NEXT: s_add_i32 s58, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 24 -; SI-NEXT: s_add_i32 s15, s76, 0x300 -; SI-NEXT: s_add_i32 s50, s74, 3 -; SI-NEXT: v_readlane_b32 s76, v61, 56 -; SI-NEXT: s_and_b32 s6, s58, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_and_b32 s74, s50, 0xff -; SI-NEXT: s_lshl_b32 s76, s76, 8 -; SI-NEXT: s_or_b32 s78, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 32 -; SI-NEXT: s_or_b32 s74, s76, s74 -; SI-NEXT: v_readlane_b32 s76, v61, 55 -; SI-NEXT: s_add_i32 s59, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 26 -; SI-NEXT: s_add_i32 s19, s77, 0x300 -; SI-NEXT: s_add_i32 s48, s76, 3 -; SI-NEXT: v_readlane_b32 s77, v61, 54 -; SI-NEXT: s_and_b32 s6, s59, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_and_b32 s76, s48, 0xff -; SI-NEXT: s_lshl_b32 s77, s77, 8 -; SI-NEXT: s_or_b32 s88, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 30 -; SI-NEXT: s_or_b32 s76, s77, s76 -; SI-NEXT: v_readlane_b32 s77, v61, 53 -; SI-NEXT: s_add_i32 s57, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 25 -; SI-NEXT: s_add_i32 s14, s75, 0x300 -; SI-NEXT: s_add_i32 s75, s78, 0x300 -; SI-NEXT: s_add_i32 s37, s77, 3 -; SI-NEXT: v_readlane_b32 s78, v61, 52 -; SI-NEXT: s_and_b32 s6, s57, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_and_b32 s77, s37, 0xff -; SI-NEXT: s_lshl_b32 s78, s78, 8 -; SI-NEXT: s_or_b32 s79, s7, s6 -; SI-NEXT: s_or_b32 s77, s78, s77 -; SI-NEXT: v_readlane_b32 s78, v61, 51 -; SI-NEXT: s_add_i32 s21, s89, 0x300 -; SI-NEXT: s_add_i32 s89, s79, 0x300 -; SI-NEXT: s_add_i32 s34, s78, 3 -; SI-NEXT: v_readlane_b32 s79, v61, 50 -; SI-NEXT: s_and_b32 s78, s34, 0xff -; SI-NEXT: s_lshl_b32 s79, s79, 8 -; SI-NEXT: s_or_b32 s78, s79, s78 -; SI-NEXT: v_readlane_b32 s79, v61, 49 -; SI-NEXT: v_readlane_b32 s90, v61, 48 -; SI-NEXT: s_add_i32 s25, s92, 0x300 -; SI-NEXT: s_add_i32 s30, s79, 3 -; SI-NEXT: s_lshl_b32 s92, s90, 8 -; SI-NEXT: v_readlane_b32 s90, v61, 47 -; SI-NEXT: s_and_b32 s79, s30, 0xff -; SI-NEXT: s_add_i32 s93, s90, 3 -; SI-NEXT: v_readlane_b32 s90, v61, 46 -; SI-NEXT: s_or_b32 s79, s92, s79 -; SI-NEXT: s_and_b32 s92, s93, 0xff -; SI-NEXT: s_lshl_b32 s91, s90, 8 -; SI-NEXT: v_readlane_b32 s90, v61, 45 -; SI-NEXT: s_or_b32 s91, s91, s92 -; SI-NEXT: s_add_i32 s90, s90, 3 -; SI-NEXT: v_readlane_b32 s92, v61, 44 -; SI-NEXT: s_and_b32 s90, s90, 0xff -; SI-NEXT: s_lshl_b32 s92, s92, 8 -; SI-NEXT: s_or_b32 s90, s92, s90 -; SI-NEXT: v_readlane_b32 s92, v61, 43 -; SI-NEXT: s_add_i32 s92, s92, 3 -; SI-NEXT: s_and_b32 s92, s92, 0xff -; SI-NEXT: s_lshl_b32 s93, s38, 8 -; SI-NEXT: s_or_b32 s92, s93, s92 -; SI-NEXT: v_readlane_b32 s93, v61, 42 -; SI-NEXT: s_add_i32 s93, s93, 3 -; SI-NEXT: v_readlane_b32 s94, v61, 41 -; SI-NEXT: s_and_b32 s93, s93, 0xff -; SI-NEXT: s_lshl_b32 s94, s94, 8 -; SI-NEXT: s_or_b32 s93, s94, s93 -; SI-NEXT: v_readlane_b32 s94, v61, 40 -; SI-NEXT: s_add_i32 s94, s94, 3 -; SI-NEXT: s_add_i32 s26, s95, 0x300 -; SI-NEXT: s_and_b32 s94, s94, 0xff -; SI-NEXT: s_lshl_b32 s95, s35, 8 -; SI-NEXT: s_or_b32 s94, s95, s94 -; SI-NEXT: v_readlane_b32 s95, v61, 1 -; SI-NEXT: s_add_i32 s95, s95, 3 -; SI-NEXT: v_readlane_b32 s30, v61, 0 -; SI-NEXT: s_add_i32 s6, vcc_lo, 0x300 -; SI-NEXT: s_and_b32 s95, s95, 0xff -; SI-NEXT: s_lshl_b32 vcc_lo, s30, 8 -; SI-NEXT: v_readlane_b32 s30, v61, 3 -; SI-NEXT: s_or_b32 s95, vcc_lo, s95 -; SI-NEXT: s_add_i32 vcc_lo, s30, 3 -; SI-NEXT: v_readlane_b32 s30, v61, 2 -; SI-NEXT: s_add_i32 s28, vcc_hi, 0x300 -; SI-NEXT: s_and_b32 vcc_lo, vcc_lo, 0xff -; SI-NEXT: s_lshl_b32 vcc_hi, s30, 8 -; SI-NEXT: v_readlane_b32 s30, v61, 5 -; SI-NEXT: s_or_b32 vcc_lo, vcc_hi, vcc_lo -; SI-NEXT: s_add_i32 vcc_hi, s30, 3 -; SI-NEXT: v_readlane_b32 s30, v61, 4 -; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s30, s30, 8 -; SI-NEXT: s_or_b32 vcc_hi, s30, vcc_hi -; SI-NEXT: v_readlane_b32 s30, v61, 7 -; SI-NEXT: s_addk_i32 vcc_hi, 0x300 -; SI-NEXT: s_add_i32 s30, s30, 3 -; SI-NEXT: v_readlane_b32 s31, v61, 6 -; SI-NEXT: s_and_b32 s30, s30, 0xff -; SI-NEXT: s_lshl_b32 s31, s31, 8 -; SI-NEXT: v_cvt_f32_f16_e32 v4, vcc_hi -; SI-NEXT: s_or_b32 s30, s31, s30 -; SI-NEXT: v_readlane_b32 s31, v61, 9 -; SI-NEXT: s_add_i32 s31, s31, 3 -; SI-NEXT: v_readlane_b32 s34, v61, 8 -; SI-NEXT: s_addk_i32 vcc_lo, 0x300 -; SI-NEXT: s_and_b32 s31, s31, 0xff -; SI-NEXT: s_lshl_b32 s34, s34, 8 -; SI-NEXT: s_or_b32 s31, s34, s31 -; SI-NEXT: v_readlane_b32 s34, v61, 39 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, vcc_lo -; SI-NEXT: s_add_i32 s34, s34, 3 -; SI-NEXT: v_readlane_b32 s35, v61, 38 -; SI-NEXT: s_and_b32 s34, s34, 0xff -; SI-NEXT: s_lshl_b32 s35, s35, 8 -; SI-NEXT: s_addk_i32 s95, 0x300 -; SI-NEXT: s_or_b32 s34, s35, s34 -; SI-NEXT: v_readlane_b32 s35, v61, 37 -; SI-NEXT: s_add_i32 s29, s36, 0x300 -; SI-NEXT: s_add_i32 s35, s35, 3 -; SI-NEXT: v_readlane_b32 s36, v61, 36 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, s95 -; SI-NEXT: s_and_b32 s35, s35, 0xff -; SI-NEXT: s_lshl_b32 s36, s36, 8 -; SI-NEXT: s_or_b32 s35, s36, s35 -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_addk_i32 s5, 0x300 -; SI-NEXT: s_add_i32 s7, s60, 0x300 -; SI-NEXT: s_add_i32 s8, s61, 0x300 -; SI-NEXT: s_add_i32 s9, s62, 0x300 -; SI-NEXT: s_addk_i32 s10, 0x300 -; SI-NEXT: s_addk_i32 s22, 0x300 -; SI-NEXT: s_add_i32 s40, s39, 0x300 -; SI-NEXT: s_add_i32 s41, s52, 0x300 -; SI-NEXT: s_add_i32 s42, s53, 0x300 -; SI-NEXT: s_add_i32 s43, s64, 0x300 -; SI-NEXT: s_add_i32 s44, s66, 0x300 -; SI-NEXT: s_add_i32 s45, s67, 0x300 -; SI-NEXT: s_add_i32 s46, s68, 0x300 -; SI-NEXT: s_add_i32 s47, s69, 0x300 -; SI-NEXT: s_add_i32 s56, s70, 0x300 -; SI-NEXT: s_add_i32 s57, s71, 0x300 -; SI-NEXT: s_add_i32 s58, s81, 0x300 -; SI-NEXT: s_add_i32 s59, s83, 0x300 -; SI-NEXT: s_add_i32 s60, s85, 0x300 -; SI-NEXT: s_add_i32 s61, s96, 0x300 -; SI-NEXT: s_add_i32 s62, s97, 0x300 -; SI-NEXT: s_addk_i32 s63, 0x300 -; SI-NEXT: s_addk_i32 s88, 0x300 -; SI-NEXT: s_addk_i32 s23, 0x300 -; SI-NEXT: s_addk_i32 s18, 0x300 -; SI-NEXT: s_addk_i32 s17, 0x300 -; SI-NEXT: s_addk_i32 s16, 0x300 -; SI-NEXT: s_addk_i32 s20, 0x300 -; SI-NEXT: s_addk_i32 s24, 0x300 -; SI-NEXT: s_addk_i32 s27, 0x300 -; SI-NEXT: s_addk_i32 s72, 0x300 -; SI-NEXT: s_addk_i32 s73, 0x300 -; SI-NEXT: s_addk_i32 s74, 0x300 -; SI-NEXT: s_addk_i32 s76, 0x300 -; SI-NEXT: s_addk_i32 s77, 0x300 -; SI-NEXT: s_addk_i32 s78, 0x300 -; SI-NEXT: s_addk_i32 s79, 0x300 -; SI-NEXT: s_addk_i32 s91, 0x300 -; SI-NEXT: s_addk_i32 s90, 0x300 -; SI-NEXT: s_addk_i32 s92, 0x300 -; SI-NEXT: s_addk_i32 s93, 0x300 -; SI-NEXT: s_addk_i32 s94, 0x300 -; SI-NEXT: s_addk_i32 s30, 0x300 -; SI-NEXT: s_addk_i32 s31, 0x300 -; SI-NEXT: s_addk_i32 s34, 0x300 -; SI-NEXT: s_addk_i32 s35, 0x300 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s35 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s34 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s31 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s30 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, s94 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s93 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s92 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s90 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s91 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s79 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s78 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s77 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s76 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s74 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s73 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s72 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s89 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s88 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s75 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s63 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s62 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 -; SI-NEXT: .LBB93_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; SI-NEXT: v_readlane_b32 s99, v63, 35 -; SI-NEXT: v_or_b32_e32 v31, v31, v32 -; SI-NEXT: v_readlane_b32 s98, v63, 34 -; SI-NEXT: v_readlane_b32 s97, v63, 33 -; SI-NEXT: v_readlane_b32 s96, v63, 32 -; SI-NEXT: v_readlane_b32 s87, v63, 31 -; SI-NEXT: v_readlane_b32 s86, v63, 30 -; SI-NEXT: v_readlane_b32 s85, v63, 29 -; SI-NEXT: v_readlane_b32 s84, v63, 28 -; SI-NEXT: v_readlane_b32 s83, v63, 27 -; SI-NEXT: v_readlane_b32 s82, v63, 26 -; SI-NEXT: v_readlane_b32 s81, v63, 25 -; SI-NEXT: v_readlane_b32 s80, v63, 24 -; SI-NEXT: v_readlane_b32 s71, v63, 23 -; SI-NEXT: v_readlane_b32 s70, v63, 22 -; SI-NEXT: v_readlane_b32 s69, v63, 21 -; SI-NEXT: v_readlane_b32 s68, v63, 20 -; SI-NEXT: v_readlane_b32 s67, v63, 19 -; SI-NEXT: v_readlane_b32 s66, v63, 18 -; SI-NEXT: v_readlane_b32 s65, v63, 17 -; SI-NEXT: v_readlane_b32 s64, v63, 16 -; SI-NEXT: v_readlane_b32 s55, v63, 15 -; SI-NEXT: v_readlane_b32 s54, v63, 14 -; SI-NEXT: v_readlane_b32 s53, v63, 13 -; SI-NEXT: v_readlane_b32 s52, v63, 12 -; SI-NEXT: v_readlane_b32 s51, v63, 11 -; SI-NEXT: v_readlane_b32 s50, v63, 10 -; SI-NEXT: v_readlane_b32 s49, v63, 9 -; SI-NEXT: v_readlane_b32 s48, v63, 8 -; SI-NEXT: v_readlane_b32 s39, v63, 7 -; SI-NEXT: v_readlane_b32 s38, v63, 6 -; SI-NEXT: v_readlane_b32 s37, v63, 5 -; SI-NEXT: v_readlane_b32 s36, v63, 4 -; SI-NEXT: v_readlane_b32 s35, v63, 3 -; SI-NEXT: v_readlane_b32 s34, v63, 2 -; SI-NEXT: v_readlane_b32 s31, v63, 1 -; SI-NEXT: v_readlane_b32 s30, v63, 0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v40 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v13 -; SI-NEXT: v_or_b32_e32 v5, v10, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v17 -; SI-NEXT: v_or_b32_e32 v6, v10, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v20 -; SI-NEXT: v_or_b32_e32 v7, v11, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v18 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v21 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v22 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v28 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v25 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v26 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v29 -; SI-NEXT: v_or_b32_e32 v13, v18, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v27 -; SI-NEXT: v_or_b32_e32 v15, v19, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v33 -; SI-NEXT: v_or_b32_e32 v14, v18, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v34 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v37 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v38 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v49 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v50 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v53 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v54 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v43 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v41 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v42 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v47 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v45 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v46 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v59 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v57 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v58 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v60 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v29, v34, v29 -; SI-NEXT: v_or_b32_e32 v30, v33, v30 -; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB93_4: -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: s_mov_b32 s18, s54 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: s_mov_b32 s17, s15 -; SI-NEXT: s_mov_b32 s15, s50 -; SI-NEXT: s_mov_b32 s23, s34 -; SI-NEXT: s_mov_b32 s21, s39 -; SI-NEXT: s_mov_b32 s29, s30 -; SI-NEXT: s_mov_b32 s79, s92 -; SI-NEXT: s_mov_b32 s16, s59 -; SI-NEXT: s_mov_b32 s58, s57 -; SI-NEXT: s_mov_b32 s56, s47 -; SI-NEXT: s_mov_b32 s41, s13 -; SI-NEXT: s_mov_b32 s12, s11 -; SI-NEXT: s_mov_b32 s7, s85 -; SI-NEXT: s_mov_b32 s28, s26 -; SI-NEXT: s_mov_b32 s26, s25 -; SI-NEXT: s_mov_b32 s85, s97 -; SI-NEXT: s_mov_b32 s97, s19 -; SI-NEXT: s_mov_b32 s25, s69 -; SI-NEXT: s_mov_b32 s69, s81 -; SI-NEXT: s_mov_b32 s37, s66 -; SI-NEXT: s_mov_b32 s66, s53 -; SI-NEXT: s_mov_b32 s53, s36 -; SI-NEXT: s_mov_b32 s36, s89 -; SI-NEXT: s_mov_b32 s89, s95 -; SI-NEXT: s_mov_b32 s44, s45 -; SI-NEXT: s_mov_b32 s14, s48 -; SI-NEXT: s_mov_b32 s98, s76 -; SI-NEXT: s_mov_b32 s90, s68 -; SI-NEXT: s_mov_b32 s38, s31 -; SI-NEXT: s_mov_b32 s6, s99 -; SI-NEXT: s_mov_b32 s35, s87 -; SI-NEXT: v_readlane_b32 s99, v61, 35 -; SI-NEXT: v_readlane_b32 s96, v61, 34 -; SI-NEXT: v_readlane_b32 s55, v61, 33 -; SI-NEXT: v_readlane_b32 s82, v61, 31 -; SI-NEXT: v_readlane_b32 s86, v61, 32 -; SI-NEXT: v_readlane_b32 s83, v61, 29 -; SI-NEXT: v_readlane_b32 s87, v61, 30 -; SI-NEXT: v_readlane_b32 s84, v61, 27 -; SI-NEXT: v_readlane_b32 s51, v61, 28 -; SI-NEXT: v_readlane_b32 s80, v61, 26 -; SI-NEXT: v_readlane_b32 s71, v61, 25 -; SI-NEXT: v_readlane_b32 s49, v61, 23 -; SI-NEXT: v_readlane_b32 s70, v61, 24 -; SI-NEXT: v_readlane_b32 s65, v61, 21 -; SI-NEXT: v_readlane_b32 s67, v61, 22 -; SI-NEXT: v_readlane_b32 s54, v61, 19 -; SI-NEXT: v_readlane_b32 s64, v61, 20 -; SI-NEXT: v_readlane_b32 s50, v61, 18 -; SI-NEXT: v_readlane_b32 s34, v61, 17 -; SI-NEXT: v_readlane_b32 s52, v61, 15 -; SI-NEXT: v_readlane_b32 s48, v61, 16 -; SI-NEXT: v_readlane_b32 s30, v61, 13 -; SI-NEXT: v_readlane_b32 s39, v61, 11 -; SI-NEXT: v_readlane_b32 s92, v61, 12 -; SI-NEXT: v_readlane_b32 s77, v61, 10 -; SI-NEXT: v_readlane_b32 s75, v61, 14 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s8, s13, 24 +; SI-NEXT: s_or_b32 s9, s8, s6 +; SI-NEXT: v_readlane_b32 s6, v43, 3 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: v_readlane_b32 s8, v43, 2 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s8, s8, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_writelane_b32 v42, s9, 53 +; SI-NEXT: s_or_b32 s6, s8, s6 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s8, s41, 0xff +; SI-NEXT: s_lshl_b32 s9, s40, 8 +; SI-NEXT: s_or_b32 s9, s8, s9 +; SI-NEXT: s_and_b32 s8, s14, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s10, s15, 24 +; SI-NEXT: s_or_b32 s11, s10, s8 +; SI-NEXT: s_and_b32 s8, s42, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s10, s43, 24 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: v_writelane_b32 v42, s11, 54 +; SI-NEXT: s_or_b32 s8, s10, s8 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_and_b32 s10, vcc_lo, 0xff +; SI-NEXT: s_lshl_b32 s11, s78, 8 +; SI-NEXT: s_or_b32 s11, s10, s11 +; SI-NEXT: s_and_b32 s10, s76, 0xff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_lshl_b32 s12, s77, 24 +; SI-NEXT: s_or_b32 s13, s12, s10 +; SI-NEXT: s_and_b32 s10, vcc_hi, 0xff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_lshl_b32 s12, s38, 24 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: v_writelane_b32 v42, s13, 55 +; SI-NEXT: s_or_b32 s10, s12, s10 +; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: s_and_b32 s12, s50, 0xff +; SI-NEXT: s_lshl_b32 s13, s49, 8 +; SI-NEXT: s_or_b32 s13, s12, s13 +; SI-NEXT: s_and_b32 s12, s39, 0xff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_lshl_b32 s14, s48, 24 +; SI-NEXT: s_or_b32 s27, s14, s12 +; SI-NEXT: s_and_b32 s12, s51, 0xff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_lshl_b32 s14, s52, 24 +; SI-NEXT: s_or_b32 s12, s14, s12 +; SI-NEXT: s_and_b32 s14, s64, 0xff +; SI-NEXT: s_lshl_b32 s15, s55, 8 +; SI-NEXT: s_or_b32 s15, s14, s15 +; SI-NEXT: s_and_b32 s14, s53, 0xff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_lshl_b32 s16, s54, 24 +; SI-NEXT: s_or_b32 s26, s16, s14 +; SI-NEXT: s_and_b32 s14, s65, 0xff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_lshl_b32 s25, s66, 24 +; SI-NEXT: s_or_b32 s14, s25, s14 +; SI-NEXT: s_and_b32 s25, s85, 0xff +; SI-NEXT: s_lshl_b32 s40, s68, 8 +; SI-NEXT: s_or_b32 s41, s25, s40 +; SI-NEXT: s_and_b32 s25, s69, 0xff +; SI-NEXT: s_lshl_b32 s25, s25, 16 +; SI-NEXT: s_lshl_b32 s40, s70, 24 +; SI-NEXT: s_or_b32 s16, s40, s25 +; SI-NEXT: s_and_b32 s40, s37, 0xff +; SI-NEXT: s_lshl_b32 s40, s40, 16 +; SI-NEXT: s_lshl_b32 s42, s87, 24 +; SI-NEXT: s_or_b32 s40, s42, s40 +; SI-NEXT: s_and_b32 s42, s20, 0xff +; SI-NEXT: s_lshl_b32 s43, s30, 8 +; SI-NEXT: s_or_b32 s43, s42, s43 +; SI-NEXT: s_and_b32 s42, s71, 0xff +; SI-NEXT: s_lshl_b32 s42, s42, 16 +; SI-NEXT: s_lshl_b32 s76, s67, 24 +; SI-NEXT: s_or_b32 s69, s76, s42 +; SI-NEXT: s_and_b32 s42, s19, 0xff +; SI-NEXT: s_lshl_b32 s42, s42, 16 +; SI-NEXT: s_lshl_b32 s76, s18, 24 +; SI-NEXT: s_or_b32 s42, s76, s42 +; SI-NEXT: s_and_b32 s76, s96, 0xff +; SI-NEXT: s_lshl_b32 s77, s23, 8 +; SI-NEXT: s_or_b32 s76, s76, s77 +; SI-NEXT: s_and_b32 s77, s86, 0xff +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s34, 24 +; SI-NEXT: s_or_b32 s70, s78, s77 +; SI-NEXT: s_and_b32 s77, s31, 0xff +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s35, 24 +; SI-NEXT: s_and_b32 s76, s76, 0xffff +; SI-NEXT: s_or_b32 vcc_lo, s78, s77 +; SI-NEXT: s_or_b32 vcc_hi, s76, s70 +; SI-NEXT: s_and_b32 s76, s94, 0xff +; SI-NEXT: s_lshl_b32 s77, s17, 8 +; SI-NEXT: s_or_b32 s76, s76, s77 +; SI-NEXT: s_and_b32 s77, s84, 0xff +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s92, 24 +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: s_or_b32 s71, s78, s77 +; SI-NEXT: s_and_b32 s77, s24, 0xff +; SI-NEXT: s_or_b32 s41, s41, s16 +; SI-NEXT: s_mov_b32 s37, s16 +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s21, 24 +; SI-NEXT: s_and_b32 s76, s76, 0xffff +; SI-NEXT: v_readlane_b32 s16, v43, 36 +; SI-NEXT: s_or_b32 s38, s78, s77 +; SI-NEXT: s_or_b32 s39, s76, s71 +; SI-NEXT: s_and_b32 s76, s16, 0xff +; SI-NEXT: s_lshl_b32 s77, s99, 8 +; SI-NEXT: v_readlane_b32 s16, v43, 35 +; SI-NEXT: s_or_b32 s76, s76, s77 +; SI-NEXT: s_and_b32 s77, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 34 +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s16, 24 +; SI-NEXT: v_writelane_b32 v42, s80, 56 +; SI-NEXT: s_or_b32 s80, s78, s77 +; SI-NEXT: s_and_b32 s77, s97, 0xff +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s88, 24 +; SI-NEXT: s_and_b32 s76, s76, 0xffff +; SI-NEXT: v_readlane_b32 s16, v43, 33 +; SI-NEXT: s_or_b32 s48, s78, s77 +; SI-NEXT: s_or_b32 s49, s76, s80 +; SI-NEXT: s_and_b32 s76, s98, 0xff +; SI-NEXT: s_lshl_b32 s77, s16, 8 +; SI-NEXT: v_readlane_b32 s16, v43, 32 +; SI-NEXT: s_or_b32 s76, s76, s77 +; SI-NEXT: s_and_b32 s77, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 31 +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s16, 24 +; SI-NEXT: v_writelane_b32 v42, s81, 57 +; SI-NEXT: s_or_b32 s81, s78, s77 +; SI-NEXT: s_and_b32 s77, s89, 0xff +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s90, 24 +; SI-NEXT: s_and_b32 s76, s76, 0xffff +; SI-NEXT: v_readlane_b32 s16, v43, 30 +; SI-NEXT: s_or_b32 s50, s78, s77 +; SI-NEXT: s_or_b32 s51, s76, s81 +; SI-NEXT: s_and_b32 s76, s79, 0xff +; SI-NEXT: s_lshl_b32 s77, s16, 8 +; SI-NEXT: v_readlane_b32 s16, v43, 29 +; SI-NEXT: s_or_b32 s76, s76, s77 +; SI-NEXT: s_and_b32 s77, s16, 0xff +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s82, 24 +; SI-NEXT: v_writelane_b32 v42, s82, 58 +; SI-NEXT: s_or_b32 s82, s78, s77 +; SI-NEXT: s_and_b32 s77, s36, 0xff +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s83, 24 +; SI-NEXT: s_and_b32 s76, s76, 0xffff +; SI-NEXT: v_readlane_b32 s16, v43, 26 +; SI-NEXT: s_or_b32 s52, s78, s77 +; SI-NEXT: s_or_b32 s53, s76, s82 +; SI-NEXT: s_and_b32 s76, s91, 0xff +; SI-NEXT: s_lshl_b32 s77, s16, 8 +; SI-NEXT: v_readlane_b32 s16, v43, 25 +; SI-NEXT: s_or_b32 s76, s76, s77 +; SI-NEXT: s_and_b32 s77, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 24 +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s16, 24 +; SI-NEXT: v_readlane_b32 s16, v43, 28 +; SI-NEXT: s_or_b32 s83, s78, s77 +; SI-NEXT: s_and_b32 s77, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 27 +; SI-NEXT: s_lshl_b32 s78, s16, 24 +; SI-NEXT: s_and_b32 s76, s76, 0xffff +; SI-NEXT: v_readlane_b32 s16, v43, 21 +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_or_b32 s55, s76, s83 +; SI-NEXT: s_and_b32 s76, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 20 +; SI-NEXT: s_or_b32 s54, s78, s77 +; SI-NEXT: s_lshl_b32 s77, s16, 8 +; SI-NEXT: v_readlane_b32 s16, v43, 19 +; SI-NEXT: s_or_b32 s76, s76, s77 +; SI-NEXT: s_and_b32 s77, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 18 +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s16, 24 +; SI-NEXT: v_readlane_b32 s16, v43, 23 +; SI-NEXT: s_or_b32 s84, s78, s77 +; SI-NEXT: s_and_b32 s77, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 22 +; SI-NEXT: s_lshl_b32 s78, s16, 24 +; SI-NEXT: s_and_b32 s76, s76, 0xffff +; SI-NEXT: v_readlane_b32 s16, v43, 16 +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_or_b32 s65, s76, s84 +; SI-NEXT: s_and_b32 s76, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 15 +; SI-NEXT: s_or_b32 s64, s78, s77 +; SI-NEXT: s_lshl_b32 s77, s16, 8 +; SI-NEXT: v_readlane_b32 s16, v43, 14 +; SI-NEXT: v_writelane_b32 v42, s93, 59 +; SI-NEXT: s_or_b32 s76, s76, s77 +; SI-NEXT: s_and_b32 s77, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 13 +; SI-NEXT: v_writelane_b32 v42, s90, 60 +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s16, 24 +; SI-NEXT: s_or_b32 s96, s78, s77 +; SI-NEXT: s_and_b32 s77, s44, 0xff +; SI-NEXT: v_readlane_b32 s25, v43, 17 +; SI-NEXT: v_readlane_b32 s16, v42, 51 +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s25, 24 +; SI-NEXT: s_and_b32 s44, s16, 0xffff +; SI-NEXT: s_lshr_b64 s[16:17], vcc, 16 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_or_b32 s66, s78, s77 +; SI-NEXT: s_mov_b32 s77, s22 +; SI-NEXT: s_and_b32 s46, s46, 0xffff +; SI-NEXT: s_and_b32 s22, s73, 0xffff +; SI-NEXT: v_readlane_b32 s17, v42, 52 +; SI-NEXT: s_or_b32 s13, s13, s27 +; SI-NEXT: s_or_b32 s15, s15, s26 +; SI-NEXT: s_mov_b32 s93, s88 +; SI-NEXT: s_mov_b32 s88, s98 +; SI-NEXT: s_and_b32 s76, s76, 0xffff +; SI-NEXT: s_and_b32 s45, s45, 0xffff +; SI-NEXT: s_and_b32 s98, s62, 0xffff +; SI-NEXT: s_or_b32 s62, s46, s8 +; SI-NEXT: s_lshr_b64 s[24:25], s[8:9], 16 +; SI-NEXT: s_or_b32 s8, s22, s54 +; SI-NEXT: s_mov_b32 s22, s77 +; SI-NEXT: s_lshr_b32 s77, s17, 16 +; SI-NEXT: v_readlane_b32 s17, v42, 53 +; SI-NEXT: s_and_b32 s43, s43, 0xffff +; SI-NEXT: s_mov_b32 s90, s89 +; SI-NEXT: s_mov_b32 s89, s79 +; SI-NEXT: s_mov_b32 s79, s91 +; SI-NEXT: s_mov_b32 s91, s99 +; SI-NEXT: s_or_b32 s67, s76, s96 +; SI-NEXT: s_and_b32 s47, s47, 0xffff +; SI-NEXT: s_and_b32 s56, s56, 0xffff +; SI-NEXT: s_and_b32 s57, s57, 0xffff +; SI-NEXT: s_and_b32 s30, s58, 0xffff +; SI-NEXT: s_and_b32 s86, s61, 0xffff +; SI-NEXT: s_and_b32 s85, s63, 0xffff +; SI-NEXT: s_and_b32 s87, s72, 0xffff +; SI-NEXT: s_and_b32 s68, s74, 0xffff +; SI-NEXT: s_and_b32 s99, s75, 0xffff +; SI-NEXT: s_or_b32 s74, s44, s4 +; SI-NEXT: s_mov_b32 s75, s5 +; SI-NEXT: s_lshr_b64 s[18:19], s[4:5], 16 +; SI-NEXT: s_or_b32 s72, s45, s6 +; SI-NEXT: s_mov_b32 s73, s7 +; SI-NEXT: s_lshr_b64 s[20:21], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[14:15], 16 +; SI-NEXT: s_lshr_b32 s76, s17, 16 +; SI-NEXT: v_readlane_b32 s17, v42, 54 +; SI-NEXT: s_or_b32 s43, s43, s69 +; SI-NEXT: s_and_b32 s34, s59, 0xffff +; SI-NEXT: s_and_b32 s36, s60, 0xffff +; SI-NEXT: s_mov_b32 s63, s9 +; SI-NEXT: s_or_b32 s60, s47, s10 +; SI-NEXT: s_mov_b32 s61, s11 +; SI-NEXT: s_lshr_b64 s[28:29], s[10:11], 16 +; SI-NEXT: s_or_b32 s58, s56, s12 +; SI-NEXT: s_mov_b32 s59, s13 +; SI-NEXT: s_or_b32 s56, s57, s14 +; SI-NEXT: s_mov_b32 s57, s15 +; SI-NEXT: s_or_b32 s46, s30, s40 +; SI-NEXT: s_mov_b32 s94, s6 +; SI-NEXT: s_mov_b32 s92, s4 +; SI-NEXT: s_mov_b32 s47, s41 +; SI-NEXT: s_lshr_b64 s[30:31], s[40:41], 16 +; SI-NEXT: s_or_b32 s40, s86, s38 +; SI-NEXT: s_mov_b32 s41, s39 +; SI-NEXT: s_lshr_b64 s[38:39], s[38:39], 16 +; SI-NEXT: s_or_b32 s14, s98, s48 +; SI-NEXT: s_mov_b32 s15, s49 +; SI-NEXT: s_lshr_b64 s[48:49], s[48:49], 16 +; SI-NEXT: s_or_b32 s12, s85, s50 +; SI-NEXT: s_mov_b32 s13, s51 +; SI-NEXT: s_lshr_b64 s[50:51], s[50:51], 16 +; SI-NEXT: s_or_b32 s10, s87, s52 +; SI-NEXT: s_mov_b32 s11, s53 +; SI-NEXT: s_lshr_b64 s[52:53], s[52:53], 16 +; SI-NEXT: s_mov_b32 s9, s55 +; SI-NEXT: s_lshr_b64 s[54:55], s[54:55], 16 +; SI-NEXT: s_or_b32 s6, s68, s64 +; SI-NEXT: s_mov_b32 s7, s65 +; SI-NEXT: s_lshr_b64 s[64:65], s[64:65], 16 +; SI-NEXT: s_or_b32 s4, s99, s66 +; SI-NEXT: s_mov_b32 s5, s67 +; SI-NEXT: s_lshr_b64 s[66:67], s[66:67], 16 +; SI-NEXT: s_lshr_b32 s78, s17, 16 +; SI-NEXT: v_readlane_b32 s17, v42, 55 +; SI-NEXT: s_or_b32 s44, s34, s42 +; SI-NEXT: s_mov_b32 s45, s43 +; SI-NEXT: s_lshr_b64 s[34:35], s[42:43], 16 +; SI-NEXT: s_or_b32 s42, s36, vcc_lo +; SI-NEXT: s_mov_b32 s43, vcc_hi +; SI-NEXT: s_lshr_b32 s67, s17, 16 +; SI-NEXT: s_lshr_b32 s27, s27, 16 +; SI-NEXT: s_lshr_b32 s55, s26, 16 +; SI-NEXT: s_lshr_b32 s36, s37, 16 +; SI-NEXT: s_lshr_b32 s69, s69, 16 +; SI-NEXT: s_lshr_b32 s65, s70, 16 +; SI-NEXT: s_lshr_b32 s71, s71, 16 +; SI-NEXT: s_lshr_b32 s37, s80, 16 +; SI-NEXT: v_readlane_b32 s80, v42, 56 +; SI-NEXT: s_lshr_b32 s39, s81, 16 +; SI-NEXT: v_readlane_b32 s81, v42, 57 +; SI-NEXT: s_lshr_b32 s49, s82, 16 +; SI-NEXT: v_readlane_b32 s82, v42, 58 +; SI-NEXT: s_lshr_b32 s51, s83, 16 +; SI-NEXT: s_mov_b32 s99, s91 +; SI-NEXT: s_mov_b32 s91, s79 +; SI-NEXT: s_mov_b32 s98, s88 +; SI-NEXT: s_mov_b32 s79, s89 +; SI-NEXT: s_mov_b32 s89, s90 +; SI-NEXT: v_readlane_b32 s90, v42, 60 +; SI-NEXT: s_mov_b32 s88, s93 +; SI-NEXT: v_readlane_b32 s93, v42, 59 +; SI-NEXT: s_lshr_b32 s53, s84, 16 +; SI-NEXT: s_mov_b32 s68, s16 +; SI-NEXT: s_lshr_b32 s70, s96, 16 +; SI-NEXT: s_cbranch_execnz .LBB93_3 +; SI-NEXT: .LBB93_2: ; %cmp.true +; SI-NEXT: v_readlane_b32 s4, v43, 40 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v43, 38 +; SI-NEXT: v_readlane_b32 s6, v43, 37 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v43, 17 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v43, 16 +; SI-NEXT: s_add_i32 s5, s5, 3 +; SI-NEXT: v_readlane_b32 s6, v43, 15 +; SI-NEXT: v_readlane_b32 s7, v43, 14 +; SI-NEXT: s_and_b32 s5, s5, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 8 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_readlane_b32 s6, v43, 13 +; SI-NEXT: s_and_b32 s7, s7, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_readlane_b32 s6, v43, 43 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: v_readlane_b32 s7, v43, 41 +; SI-NEXT: v_readlane_b32 s8, v43, 23 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: v_readlane_b32 s7, v43, 22 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: v_readlane_b32 s7, v43, 21 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: v_readlane_b32 s8, v43, 20 +; SI-NEXT: v_readlane_b32 s9, v43, 19 +; SI-NEXT: s_and_b32 s7, s7, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_readlane_b32 s8, v43, 18 +; SI-NEXT: s_and_b32 s9, s9, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 24 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_add_i32 s8, s80, 3 +; SI-NEXT: v_readlane_b32 s10, v43, 28 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: s_lshl_b32 s9, s93, 8 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: v_readlane_b32 s9, v43, 27 +; SI-NEXT: s_and_b32 s10, s10, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 24 +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_addk_i32 s8, 0x300 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_add_i32 s9, s91, 3 +; SI-NEXT: v_readlane_b32 s10, v43, 26 +; SI-NEXT: v_readlane_b32 s11, v43, 25 +; SI-NEXT: s_and_b32 s9, s9, 0xff +; SI-NEXT: s_lshl_b32 s10, s10, 8 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: v_readlane_b32 s10, v43, 24 +; SI-NEXT: s_and_b32 s11, s11, 0xff +; SI-NEXT: s_lshl_b32 s10, s10, 24 +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_addk_i32 s9, 0x300 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: v_readlane_b32 s10, v43, 45 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: v_readlane_b32 s11, v43, 44 +; SI-NEXT: v_readlane_b32 s12, v43, 42 +; SI-NEXT: s_and_b32 s10, s10, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 8 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_readlane_b32 s11, v43, 39 +; SI-NEXT: s_and_b32 s12, s12, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 24 +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_addk_i32 s10, 0x300 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_add_i32 s11, s79, 3 +; SI-NEXT: v_readlane_b32 s12, v43, 30 +; SI-NEXT: v_readlane_b32 s13, v43, 29 +; SI-NEXT: s_and_b32 s11, s11, 0xff +; SI-NEXT: s_lshl_b32 s12, s12, 8 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s13, s13, 0xff +; SI-NEXT: s_lshl_b32 s12, s82, 24 +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_addk_i32 s11, 0x300 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_add_i32 s12, s81, 3 +; SI-NEXT: s_and_b32 s12, s12, 0xff +; SI-NEXT: s_lshl_b32 s13, s95, 8 +; SI-NEXT: s_add_i32 s14, s89, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s14, s14, 0xff +; SI-NEXT: s_lshl_b32 s13, s90, 24 +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_addk_i32 s12, 0x300 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_add_i32 s13, s98, 3 +; SI-NEXT: v_readlane_b32 s14, v43, 33 +; SI-NEXT: v_readlane_b32 s15, v43, 32 +; SI-NEXT: s_and_b32 s13, s13, 0xff +; SI-NEXT: s_lshl_b32 s14, s14, 8 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: v_readlane_b32 s14, v43, 31 +; SI-NEXT: s_and_b32 s15, s15, 0xff +; SI-NEXT: s_lshl_b32 s14, s14, 24 +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_addk_i32 s13, 0x300 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: v_readlane_b32 s14, v42, 50 +; SI-NEXT: s_add_i32 s19, s14, 3 +; SI-NEXT: v_readlane_b32 s15, v42, 49 +; SI-NEXT: v_readlane_b32 s16, v42, 48 +; SI-NEXT: s_and_b32 s14, s19, 0xff +; SI-NEXT: s_lshl_b32 s15, s15, 8 +; SI-NEXT: s_add_i32 s18, s16, 3 +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: s_and_b32 s16, s18, 0xff +; SI-NEXT: s_lshl_b32 s15, s88, 24 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_addk_i32 s14, 0x300 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: v_readlane_b32 s15, v43, 36 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: v_readlane_b32 s17, v43, 35 +; SI-NEXT: s_and_b32 s15, s15, 0xff +; SI-NEXT: s_lshl_b32 s16, s99, 8 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_or_b32 s15, s16, s15 +; SI-NEXT: v_readlane_b32 s16, v43, 34 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: s_lshl_b32 s16, s16, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_addk_i32 s15, 0x300 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_or_b32 s15, s16, s15 +; SI-NEXT: v_readlane_b32 s16, v42, 47 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v42, 46 +; SI-NEXT: v_readlane_b32 s18, v42, 40 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s97, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v42, 41 +; SI-NEXT: s_and_b32 s18, s97, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v42, 36 +; SI-NEXT: s_add_i32 s85, s17, 3 +; SI-NEXT: v_readlane_b32 s18, v42, 34 +; SI-NEXT: v_readlane_b32 s19, v42, 28 +; SI-NEXT: s_and_b32 s17, s85, 0xff +; SI-NEXT: s_lshl_b32 s18, s18, 8 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: v_readlane_b32 s18, v42, 29 +; SI-NEXT: s_and_b32 s19, s19, 0xff +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_addk_i32 s17, 0x300 +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_add_i32 s40, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v42, 45 +; SI-NEXT: s_add_i32 s41, s17, 0x3000000 +; SI-NEXT: s_add_i32 s23, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v42, 42 +; SI-NEXT: v_readlane_b32 s18, v42, 38 +; SI-NEXT: s_and_b32 s16, s23, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s87, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v42, 39 +; SI-NEXT: s_and_b32 s18, s87, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v42, 33 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_readlane_b32 s18, v42, 31 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: s_lshl_b32 s18, s18, 8 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: v_readlane_b32 s18, v42, 24 +; SI-NEXT: s_addk_i32 s17, 0x300 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s42, s16, 0x3000000 +; SI-NEXT: s_and_b32 s16, s17, 0xffff +; SI-NEXT: v_readlane_b32 s17, v42, 23 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s43, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v42, 44 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v42, 43 +; SI-NEXT: v_readlane_b32 s18, v42, 37 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s86, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v42, 35 +; SI-NEXT: s_and_b32 s18, s86, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s44, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v42, 27 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v42, 22 +; SI-NEXT: v_readlane_b32 s18, v42, 17 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v42, 14 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s45, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v42, 32 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v42, 30 +; SI-NEXT: v_readlane_b32 s18, v42, 25 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v42, 26 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s46, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v42, 21 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v42, 20 +; SI-NEXT: v_readlane_b32 s18, v42, 15 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v42, 16 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s47, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v42, 19 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v42, 18 +; SI-NEXT: v_readlane_b32 s18, v42, 12 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v42, 13 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s56, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v42, 11 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v42, 10 +; SI-NEXT: v_readlane_b32 s18, v42, 6 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v42, 7 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s57, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v42, 9 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v42, 8 +; SI-NEXT: v_readlane_b32 s18, v42, 4 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v42, 5 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s58, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v42, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v42, 2 +; SI-NEXT: v_readlane_b32 s18, v43, 62 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v43, 63 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s59, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v42, 1 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v42, 0 +; SI-NEXT: v_readlane_b32 s18, v43, 60 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v43, 61 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s60, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v43, 59 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v43, 58 +; SI-NEXT: v_readlane_b32 s18, v43, 54 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v43, 55 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s61, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v43, 57 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v43, 56 +; SI-NEXT: v_readlane_b32 s18, v43, 52 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v43, 53 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s62, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v43, 51 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v43, 50 +; SI-NEXT: v_readlane_b32 s18, v43, 48 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v43, 49 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s63, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v43, 5 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v43, 4 +; SI-NEXT: v_readlane_b32 s18, v43, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v43, 2 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s72, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v43, 1 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v43, 0 +; SI-NEXT: v_readlane_b32 s18, v43, 46 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v43, 47 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s73, s16, 0x3000000 +; SI-NEXT: s_add_i32 s16, s22, 3 +; SI-NEXT: v_readlane_b32 s17, v43, 12 +; SI-NEXT: v_readlane_b32 s18, v43, 11 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v43, 10 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s74, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v43, 9 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v43, 8 +; SI-NEXT: v_readlane_b32 s18, v43, 7 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v43, 6 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_add_i32 s7, s7, 0x3000000 +; SI-NEXT: s_add_i32 s8, s8, 0x3000000 +; SI-NEXT: s_add_i32 s9, s9, 0x3000000 +; SI-NEXT: s_add_i32 s10, s10, 0x3000000 +; SI-NEXT: s_add_i32 s11, s11, 0x3000000 +; SI-NEXT: s_add_i32 s12, s12, 0x3000000 +; SI-NEXT: s_add_i32 s13, s13, 0x3000000 +; SI-NEXT: s_add_i32 s14, s14, 0x3000000 +; SI-NEXT: s_add_i32 s15, s15, 0x3000000 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s75, s16, 0x3000000 +; SI-NEXT: s_lshr_b64 s[68:69], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[38:39], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[48:49], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[50:51], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[52:53], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[54:55], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[64:65], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[66:67], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[18:19], s[74:75], 16 +; SI-NEXT: s_lshr_b64 s[20:21], s[72:73], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[62:63], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[60:61], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[58:59], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[56:57], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[44:45], 16 +; SI-NEXT: s_lshr_b32 s77, s75, 16 +; SI-NEXT: s_lshr_b32 s76, s73, 16 +; SI-NEXT: s_lshr_b32 s78, s63, 16 +; SI-NEXT: s_lshr_b32 s67, s61, 16 +; SI-NEXT: s_lshr_b32 s27, s59, 16 +; SI-NEXT: s_lshr_b32 s55, s57, 16 +; SI-NEXT: s_lshr_b32 s36, s47, 16 +; SI-NEXT: s_lshr_b32 s69, s45, 16 +; SI-NEXT: s_lshr_b32 s65, s43, 16 +; SI-NEXT: s_lshr_b32 s71, s41, 16 +; SI-NEXT: s_lshr_b32 s37, s15, 16 +; SI-NEXT: s_lshr_b32 s39, s13, 16 +; SI-NEXT: s_lshr_b32 s49, s11, 16 +; SI-NEXT: s_lshr_b32 s51, s9, 16 +; SI-NEXT: s_lshr_b32 s53, s7, 16 +; SI-NEXT: s_lshr_b32 s70, s5, 16 +; SI-NEXT: .LBB93_3: ; %end +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_and_b32 s16, s74, 0xffff +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_or_b32 s16, s16, s18 +; SI-NEXT: s_and_b32 s18, s75, 0xffff +; SI-NEXT: s_lshl_b32 s19, s77, 16 +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: s_and_b32 s19, s72, 0xffff +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_or_b32 s19, s19, s20 +; SI-NEXT: s_and_b32 s20, s73, 0xffff +; SI-NEXT: s_lshl_b32 s21, s76, 16 +; SI-NEXT: s_or_b32 s20, s20, s21 +; SI-NEXT: s_and_b32 s21, s62, 0xffff +; SI-NEXT: s_lshl_b32 s22, s24, 16 +; SI-NEXT: s_or_b32 s21, s21, s22 +; SI-NEXT: s_and_b32 s22, s63, 0xffff +; SI-NEXT: s_lshl_b32 s23, s78, 16 +; SI-NEXT: s_or_b32 s22, s22, s23 +; SI-NEXT: s_and_b32 s23, s60, 0xffff +; SI-NEXT: s_lshl_b32 s24, s28, 16 +; SI-NEXT: s_or_b32 s23, s23, s24 +; SI-NEXT: s_and_b32 s24, s61, 0xffff +; SI-NEXT: s_lshl_b32 s25, s67, 16 +; SI-NEXT: s_or_b32 s24, s24, s25 +; SI-NEXT: s_and_b32 s25, s58, 0xffff +; SI-NEXT: s_lshl_b32 s26, s92, 16 +; SI-NEXT: s_or_b32 s25, s25, s26 +; SI-NEXT: s_and_b32 s26, s59, 0xffff +; SI-NEXT: s_lshl_b32 s27, s27, 16 +; SI-NEXT: s_or_b32 s26, s26, s27 +; SI-NEXT: s_and_b32 s27, s56, 0xffff +; SI-NEXT: s_lshl_b32 s28, s94, 16 +; SI-NEXT: s_or_b32 s27, s27, s28 +; SI-NEXT: s_and_b32 s28, s57, 0xffff +; SI-NEXT: s_lshl_b32 s29, s55, 16 +; SI-NEXT: s_or_b32 s28, s28, s29 +; SI-NEXT: s_and_b32 s29, s46, 0xffff +; SI-NEXT: s_lshl_b32 s46, s30, 16 +; SI-NEXT: s_or_b32 s29, s29, s46 +; SI-NEXT: s_and_b32 s46, s47, 0xffff +; SI-NEXT: s_lshl_b32 s47, s36, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s44, s44, 0xffff +; SI-NEXT: s_lshl_b32 s47, s34, 16 +; SI-NEXT: s_or_b32 s44, s44, s47 +; SI-NEXT: s_and_b32 s45, s45, 0xffff +; SI-NEXT: s_lshl_b32 s47, s69, 16 +; SI-NEXT: s_or_b32 s45, s45, s47 +; SI-NEXT: s_and_b32 s42, s42, 0xffff +; SI-NEXT: s_lshl_b32 s47, s68, 16 +; SI-NEXT: s_or_b32 s42, s42, s47 +; SI-NEXT: s_and_b32 s43, s43, 0xffff +; SI-NEXT: s_lshl_b32 s47, s65, 16 +; SI-NEXT: s_or_b32 s43, s43, s47 +; SI-NEXT: s_and_b32 s40, s40, 0xffff +; SI-NEXT: s_lshl_b32 s47, s38, 16 +; SI-NEXT: s_or_b32 s40, s40, s47 +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: s_lshl_b32 s47, s71, 16 +; SI-NEXT: s_or_b32 s41, s41, s47 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s47, s48, 16 +; SI-NEXT: s_or_b32 s14, s14, s47 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s47, s37, 16 +; SI-NEXT: s_or_b32 s15, s15, s47 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s47, s50, 16 +; SI-NEXT: s_or_b32 s12, s12, s47 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s47, s39, 16 +; SI-NEXT: s_or_b32 s13, s13, s47 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s47, s52, 16 +; SI-NEXT: s_or_b32 s10, s10, s47 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s47, s49, 16 +; SI-NEXT: s_or_b32 s11, s11, s47 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s47, s54, 16 +; SI-NEXT: s_or_b32 s8, s8, s47 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s47, s51, 16 +; SI-NEXT: s_or_b32 s9, s9, s47 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s47, s64, 16 +; SI-NEXT: s_or_b32 s6, s6, s47 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s47, s53, 16 +; SI-NEXT: s_or_b32 s7, s7, s47 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s47, s66, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s17, s70, 16 +; SI-NEXT: s_or_b32 s4, s4, s47 +; SI-NEXT: s_or_b32 s5, s5, s17 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s18 +; SI-NEXT: v_mov_b32_e32 v2, s19 +; SI-NEXT: v_mov_b32_e32 v3, s20 +; SI-NEXT: v_mov_b32_e32 v4, s21 +; SI-NEXT: v_mov_b32_e32 v5, s22 +; SI-NEXT: v_mov_b32_e32 v6, s23 +; SI-NEXT: v_mov_b32_e32 v7, s24 +; SI-NEXT: v_mov_b32_e32 v8, s25 +; SI-NEXT: v_mov_b32_e32 v9, s26 +; SI-NEXT: v_mov_b32_e32 v10, s27 +; SI-NEXT: v_mov_b32_e32 v11, s28 +; SI-NEXT: v_mov_b32_e32 v12, s29 +; SI-NEXT: v_mov_b32_e32 v13, s46 +; SI-NEXT: v_mov_b32_e32 v14, s44 +; SI-NEXT: v_mov_b32_e32 v15, s45 +; SI-NEXT: v_mov_b32_e32 v16, s42 +; SI-NEXT: v_mov_b32_e32 v17, s43 +; SI-NEXT: v_mov_b32_e32 v18, s40 +; SI-NEXT: v_mov_b32_e32 v19, s41 +; SI-NEXT: v_mov_b32_e32 v20, s14 +; SI-NEXT: v_mov_b32_e32 v21, s15 +; SI-NEXT: v_mov_b32_e32 v22, s12 +; SI-NEXT: v_mov_b32_e32 v23, s13 +; SI-NEXT: v_mov_b32_e32 v24, s10 +; SI-NEXT: v_mov_b32_e32 v25, s11 +; SI-NEXT: v_mov_b32_e32 v26, s8 +; SI-NEXT: v_mov_b32_e32 v27, s9 +; SI-NEXT: v_mov_b32_e32 v28, s6 +; SI-NEXT: v_mov_b32_e32 v29, s7 +; SI-NEXT: v_mov_b32_e32 v30, s4 +; SI-NEXT: v_mov_b32_e32 v31, s5 +; SI-NEXT: v_readlane_b32 s99, v41, 35 +; SI-NEXT: v_readlane_b32 s98, v41, 34 +; SI-NEXT: v_readlane_b32 s97, v41, 33 +; SI-NEXT: v_readlane_b32 s96, v41, 32 +; SI-NEXT: v_readlane_b32 s87, v41, 31 +; SI-NEXT: v_readlane_b32 s86, v41, 30 +; SI-NEXT: v_readlane_b32 s85, v41, 29 +; SI-NEXT: v_readlane_b32 s84, v41, 28 +; SI-NEXT: v_readlane_b32 s83, v41, 27 +; SI-NEXT: v_readlane_b32 s82, v41, 26 +; SI-NEXT: v_readlane_b32 s81, v41, 25 +; SI-NEXT: v_readlane_b32 s80, v41, 24 +; SI-NEXT: v_readlane_b32 s71, v41, 23 +; SI-NEXT: v_readlane_b32 s70, v41, 22 +; SI-NEXT: v_readlane_b32 s69, v41, 21 +; SI-NEXT: v_readlane_b32 s68, v41, 20 +; SI-NEXT: v_readlane_b32 s67, v41, 19 +; SI-NEXT: v_readlane_b32 s66, v41, 18 +; SI-NEXT: v_readlane_b32 s65, v41, 17 +; SI-NEXT: v_readlane_b32 s64, v41, 16 +; SI-NEXT: v_readlane_b32 s55, v41, 15 +; SI-NEXT: v_readlane_b32 s54, v41, 14 +; SI-NEXT: v_readlane_b32 s53, v41, 13 +; SI-NEXT: v_readlane_b32 s52, v41, 12 +; SI-NEXT: v_readlane_b32 s51, v41, 11 +; SI-NEXT: v_readlane_b32 s50, v41, 10 +; SI-NEXT: v_readlane_b32 s49, v41, 9 +; SI-NEXT: v_readlane_b32 s48, v41, 8 +; SI-NEXT: v_readlane_b32 s39, v41, 7 +; SI-NEXT: v_readlane_b32 s38, v41, 6 +; SI-NEXT: v_readlane_b32 s37, v41, 5 +; SI-NEXT: v_readlane_b32 s36, v41, 4 +; SI-NEXT: v_readlane_b32 s35, v41, 3 +; SI-NEXT: v_readlane_b32 s34, v41, 2 +; SI-NEXT: v_readlane_b32 s31, v41, 1 +; SI-NEXT: v_readlane_b32 s30, v41, 0 +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB93_4: +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr71 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr70 ; SI-NEXT: s_branch .LBB93_2 ; ; VI-LABEL: bitcast_v128i8_to_v64f16_scalar: @@ -180887,853 +176880,708 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v57 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v27 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v48 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v50 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v31, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v30 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v29 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v28 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v27 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v26 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v25 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v24 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v23 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v22 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v21 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v20 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v19 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v14 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v18 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v15 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v17 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v15 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v17 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v14 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v13 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v9 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v7 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v31 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v32 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v47 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v60, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v36 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v29 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v31 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v57 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB94_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v16, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v46, v13 -; SI-NEXT: v_mov_b32_e32 v47, v10 -; SI-NEXT: v_mov_b32_e32 v10, v45 -; SI-NEXT: v_mov_b32_e32 v45, v14 -; SI-NEXT: v_mov_b32_e32 v14, v15 -; SI-NEXT: v_mov_b32_e32 v15, v18 -; SI-NEXT: v_mov_b32_e32 v18, v8 -; SI-NEXT: v_mov_b32_e32 v8, v6 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_or_b32_e32 v44, v12, v5 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v12 -; SI-NEXT: v_or_b32_e32 v41, v17, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v54, v17, v5 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v53, v17, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_mov_b32_e32 v42, v40 +; SI-NEXT: v_mov_b32_e32 v6, v53 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v52, v2, v31 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v51, v17, v5 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v45 +; SI-NEXT: v_or_b32_e32 v2, v2, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v52, v17, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v51, v31, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v43 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v49, v17, v5 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v50, v31, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v50, v17, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v48, v5, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v40 +; SI-NEXT: v_mov_b32_e32 v40, v55 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v48, v17, v5 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v49, v5, v31 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v39, v17, v5 -; SI-NEXT: v_alignbit_b32 v5, v41, v44, 24 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v38, v5, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v55 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v39, v5, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v37, v5, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v55 +; SI-NEXT: v_or_b32_e32 v36, v5, v7 +; SI-NEXT: v_alignbit_b32 v5, v2, v52, 24 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v34, v5, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v53 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v35, v5, v7 +; SI-NEXT: v_alignbit_b32 v5, v2, v52, 16 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v32, v5, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v53 +; SI-NEXT: v_or_b32_e32 v33, v5, v7 +; SI-NEXT: v_alignbit_b32 v5, v2, v52, 8 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v63, v5, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 +; SI-NEXT: v_or_b32_e32 v31, v5, v7 +; SI-NEXT: v_alignbit_b32 v5, v50, v51, 24 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v37, v17, v5 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v38, v17, v5 -; SI-NEXT: v_alignbit_b32 v5, v41, v44, 16 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v61, v5, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v13 +; SI-NEXT: v_or_b32_e32 v62, v5, v7 +; SI-NEXT: v_alignbit_b32 v5, v50, v51, 16 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v19 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v35, v17, v5 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v36, v17, v5 -; SI-NEXT: v_alignbit_b32 v5, v41, v44, 8 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v59, v5, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 +; SI-NEXT: v_or_b32_e32 v60, v5, v7 +; SI-NEXT: v_alignbit_b32 v5, v50, v51, 8 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v20, v5, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 +; SI-NEXT: v_or_b32_e32 v58, v5, v7 +; SI-NEXT: v_alignbit_b32 v5, v49, v48, 24 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v33, v17, v5 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v34, v17, v5 -; SI-NEXT: v_alignbit_b32 v5, v53, v54, 24 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v19, v5, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; SI-NEXT: v_or_b32_e32 v56, v5, v7 +; SI-NEXT: v_alignbit_b32 v5, v49, v48, 16 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v18, v5, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v46, v5, v7 +; SI-NEXT: v_alignbit_b32 v5, v49, v48, 8 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v5, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v15 +; SI-NEXT: v_or_b32_e32 v44, v5, v7 +; SI-NEXT: v_alignbit_b32 v5, v39, v38, 24 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v31, v17, v5 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v46 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v32, v17, v5 -; SI-NEXT: v_alignbit_b32 v5, v53, v54, 16 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_or_b32_e32 v16, v5, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v41, v5, v7 +; SI-NEXT: v_alignbit_b32 v5, v39, v38, 16 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v29, v17, v5 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v45 +; SI-NEXT: v_or_b32_e32 v5, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v47 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v30, v17, v5 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v5, v53, v54, 8 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v54, v1, v3 +; SI-NEXT: v_alignbit_b32 v1, v39, v38, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v56 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v27, v17, v5 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v28, v17, v5 -; SI-NEXT: v_alignbit_b32 v5, v52, v51, 24 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v36, v37, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v59 -; SI-NEXT: v_or_b32_e32 v25, v58, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v15 -; SI-NEXT: v_or_b32_e32 v26, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v5, v52, v51, 16 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v36, v37, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v62 -; SI-NEXT: v_or_b32_e32 v23, v61, v5 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_or_b32_e32 v24, v60, v5 -; SI-NEXT: v_alignbit_b32 v5, v52, v51, 8 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v36, v37, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v63 -; SI-NEXT: v_or_b32_e32 v21, v1, v5 -; SI-NEXT: v_mov_b32_e32 v5, v16 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_or_b32_e32 v22, v55, v1 -; SI-NEXT: v_alignbit_b32 v1, v50, v49, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v35, v34, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v19, v40, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; SI-NEXT: v_or_b32_e32 v20, v2, v1 -; SI-NEXT: v_alignbit_b32 v1, v50, v49, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v35, v34, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 -; SI-NEXT: v_or_b32_e32 v16, v4, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_or_b32_e32 v17, v42, v1 -; SI-NEXT: v_alignbit_b32 v1, v50, v49, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v35, v34, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v39, v48, 24 +; SI-NEXT: v_alignbit_b32 v1, v33, v32, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v39, v48, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v39, v48, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v33, v32, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v38, v37, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v33, v32, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v38, v37, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v31, v63, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v38, v37, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v31, v63, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v36, v35, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v31, v63, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v36, v35, 16 +; SI-NEXT: v_alignbit_b32 v1, v62, v61, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v36, v35, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v62, v61, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v34, v33, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v62, v61, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v34, v33, 16 +; SI-NEXT: v_alignbit_b32 v1, v60, v59, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v34, v33, 8 +; SI-NEXT: v_alignbit_b32 v1, v60, v59, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v32, v31, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v32, v31, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v60, v59, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v32, v31, 8 +; SI-NEXT: v_alignbit_b32 v1, v58, v20, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v30, v29, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v30, v29, 16 -; SI-NEXT: v_alignbit_b32 v3, v17, v16, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v30, v29, 8 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v58, v20, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v58, v20, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v41 +; SI-NEXT: v_alignbit_b32 v1, v56, v19, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v28, v27, 24 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v56, v19, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v56, v19, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v53 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v46, v18, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v28, v27, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v46, v18, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v46, v18, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v52 +; SI-NEXT: v_alignbit_b32 v1, v44, v17, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v28, v27, 8 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v50 +; SI-NEXT: v_alignbit_b32 v1, v44, v17, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v26, v25, 24 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v44, v17, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v41, v16, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v41, v16, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v41, v16, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v39 +; SI-NEXT: v_alignbit_b32 v1, v54, v5, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v26, v25, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v54, v5, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v38 +; SI-NEXT: v_alignbit_b32 v1, v54, v5, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v26, v25, 8 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v36 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v50 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v24, v23, 24 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v34 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v24, v23, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v24, v23, 8 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v30 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v22, v21, 24 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v60 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v22, v21, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v56 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v22, v21, 8 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v46 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v44 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v20, v19, 24 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v20, v19, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v45, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v43, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v20, v19, 8 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v42, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v40, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v17, v16, 24 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v55, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v3, v12, 8, 8 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v6, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v3, v13, 8, 8 -; SI-NEXT: v_mov_b32_e32 v13, v46 -; SI-NEXT: v_mov_b32_e32 v46, v1 -; SI-NEXT: v_bfe_u32 v1, v57, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v53, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v6, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v14, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v9, 8, 8 -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: v_mov_b32_e32 v8, v18 -; SI-NEXT: v_mov_b32_e32 v18, v15 -; SI-NEXT: v_mov_b32_e32 v15, v14 -; SI-NEXT: v_mov_b32_e32 v14, v45 -; SI-NEXT: v_mov_b32_e32 v45, v10 -; SI-NEXT: v_mov_b32_e32 v10, v47 +; SI-NEXT: v_bfe_u32 v1, v13, 8, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v12, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v11, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v10, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v45, 8, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v13, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v14, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v9, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v15, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v18, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v7, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v5, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v8, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -181769,342 +177617,344 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: v_alignbit_b32 v2, v17, v16, 16 +; SI-NEXT: v_mov_b32_e32 v45, v17 +; SI-NEXT: v_mov_b32_e32 v43, v16 +; SI-NEXT: v_mov_b32_e32 v55, v40 +; SI-NEXT: v_mov_b32_e32 v40, v42 +; SI-NEXT: v_mov_b32_e32 v42, v5 +; SI-NEXT: v_mov_b32_e32 v53, v6 +; SI-NEXT: v_bfe_u32 v6, v4, 8, 8 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v47, v2 -; SI-NEXT: v_bfe_u32 v12, v6, 8, 8 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: .LBB94_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB94_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_mov_b32_e32 v22, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v16, v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v42 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v42, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_or_b32_e32 v54, v1, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v42 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v17, v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 -; SI-NEXT: v_alignbit_b32 v46, v17, v16, 24 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_alignbit_b32 v47, v17, v16, 16 -; SI-NEXT: v_or_b32_e32 v19, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; SI-NEXT: v_or_b32_e32 v20, v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v18 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v43, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v41, v1, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v27 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v21, v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_or_b32_e32 v22, v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v45, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v44, v1, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v25 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v23, v2, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_or_b32_e32 v24, v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v58 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v28, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v46, v1, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v23 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v25, v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: v_or_b32_e32 v26, v2, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v2 +; SI-NEXT: v_mov_b32_e32 v24, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v26, v1, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_or_b32_e32 v56, v1, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 -; SI-NEXT: v_or_b32_e32 v27, v3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 +; SI-NEXT: v_or_b32_e32 v22, v3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v28, v2, v3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v17 +; SI-NEXT: v_or_b32_e32 v58, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v29, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v59, v2, v1 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v60, v3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v31, v4, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v30, v3, v1 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v1 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v61, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v15 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 -; SI-NEXT: v_or_b32_e32 v32, v1, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17 +; SI-NEXT: v_or_b32_e32 v62, v1, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 -; SI-NEXT: v_or_b32_e32 v33, v3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 +; SI-NEXT: v_or_b32_e32 v63, v3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v34, v2, v3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v11 +; SI-NEXT: v_or_b32_e32 v31, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v35, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v32, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v13 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v33, v3, v1 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v37, v4, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v36, v3, v1 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_or_b32_e32 v34, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v1 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; SI-NEXT: v_or_b32_e32 v38, v1, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 +; SI-NEXT: v_or_b32_e32 v35, v1, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 -; SI-NEXT: v_or_b32_e32 v48, v3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v10 +; SI-NEXT: v_or_b32_e32 v37, v3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v39, v2, v3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_or_b32_e32 v36, v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v3 +; SI-NEXT: v_or_b32_e32 v38, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v49, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v39, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v40 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v1 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v51, v4, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_or_b32_e32 v48, v4, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v50, v3, v1 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 -; SI-NEXT: v_or_b32_e32 v52, v1, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 +; SI-NEXT: v_or_b32_e32 v49, v1, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v54, v3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v51, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v53, v2, v3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v50, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -182114,275 +177964,292 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v44, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; SI-NEXT: v_or_b32_e32 v41, v3, v1 -; SI-NEXT: v_alignbit_b32 v1, v41, v44, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v41, v44, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v52, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: v_or_b32_e32 v2, v3, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v52, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v41, v44, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v2, v52, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v53, v54, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v2, v52, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v53, v54, 16 +; SI-NEXT: v_alignbit_b32 v1, v50, v51, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v53, v54, 8 +; SI-NEXT: v_alignbit_b32 v1, v50, v51, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v52, v51, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v50, v51, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v52, v51, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v49, v48, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v52, v51, 8 +; SI-NEXT: v_alignbit_b32 v1, v49, v48, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v50, v49, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v49, v48, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v50, v49, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v39, v38, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v50, v49, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v39, v38, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v39, v48, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v39, v38, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v39, v48, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v36, v37, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v39, v48, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v36, v37, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v38, v37, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v36, v37, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v38, v37, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v35, v34, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v38, v37, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v35, v34, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v36, v35, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v35, v34, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v33, v32, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v33, v32, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v33, v32, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v31, v63, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v31, v63, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v36, v35, 16 +; SI-NEXT: v_alignbit_b32 v1, v31, v63, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v62, v61, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v36, v35, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v62, v61, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v34, v33, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v62, v61, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v34, v33, 16 +; SI-NEXT: v_alignbit_b32 v1, v60, v59, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v34, v33, 8 +; SI-NEXT: v_alignbit_b32 v1, v60, v59, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v32, v31, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v32, v31, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v60, v59, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v32, v31, 8 +; SI-NEXT: v_alignbit_b32 v1, v58, v22, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v30, v29, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v30, v29, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v58, v22, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v58, v22, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v30, v29, 8 +; SI-NEXT: v_alignbit_b32 v1, v56, v26, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v28, v27, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v56, v26, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v56, v26, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v46, v28, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v46, v28, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v46, v28, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v1, v44, v45, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v28, v27, 8 +; SI-NEXT: v_alignbit_b32 v1, v44, v45, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v26, v25, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v26, v25, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v44, v45, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v26, v25, 8 +; SI-NEXT: v_alignbit_b32 v1, v41, v43, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v24, v23, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v24, v23, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v24, v23, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v22, v21, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v41, v43, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v22, v21, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v41, v43, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v22, v21, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v54, v42, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v20, v19, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v54, v42, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v20, v19, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v54, v42, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v20, v19, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v17, v16, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v50 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v41 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v53 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v52 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v50 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v39 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v36 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v34 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v60 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v30 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v56 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v46 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v26 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v44 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_bfe_u32 v1, v12, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_bfe_u32 v1, v7, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v57, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_bfe_u32 v1, v6, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v9, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v11, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v5, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v10, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v40, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v45, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v55, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_bfe_u32 v1, v9, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v53, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_bfe_u32 v1, v13, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v14, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_bfe_u32 v1, v15, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v18, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_bfe_u32 v1, v55, 8, 8 -; SI-NEXT: v_mov_b32_e32 v6, v42 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v17, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_bfe_u32 v1, v19, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_bfe_u32 v1, v21, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_bfe_u32 v1, v23, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_bfe_u32 v1, v25, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v5, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v27, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_bfe_u32 v1, v29, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v8, 8, 8 -; SI-NEXT: v_bfe_u32 v12, v6, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v6, v6, 8, 8 ; SI-NEXT: .LBB94_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v44 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v52 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -182393,14 +178260,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v51 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -182411,14 +178278,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v53 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v50 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -182429,14 +178296,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v51 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -182447,14 +178314,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v57 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v40 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -182463,14 +178330,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -182481,32 +178348,30 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v50 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v55 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -182517,30 +178382,32 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v36 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v34 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -182551,14 +178418,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v11 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v53 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -182567,14 +178434,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -182585,30 +178452,32 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v36 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v63 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -182619,30 +178488,32 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v45 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v61 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -182653,30 +178524,32 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v62 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v59 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -182687,30 +178560,34 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v30 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v60 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v27 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -182721,33 +178598,37 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v58 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v25 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -182755,30 +178636,34 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v56 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v23 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -182789,14 +178674,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v46 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -182807,14 +178692,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v45 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -182825,30 +178710,32 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v44 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v19 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v43 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -182859,47 +178746,55 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v46 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v42 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v47 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v12 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v6 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -186165,1805 +182060,1712 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-LABEL: bitcast_v64f16_to_v128i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v24, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v51 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v28, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v50 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s4 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s17 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v41 -; SI-NEXT: v_writelane_b32 v63, s30, 0 -; SI-NEXT: v_writelane_b32 v63, s31, 1 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v47 -; SI-NEXT: v_writelane_b32 v63, s34, 2 -; SI-NEXT: v_writelane_b32 v63, s35, 3 -; SI-NEXT: v_writelane_b32 v63, s36, 4 -; SI-NEXT: v_writelane_b32 v63, s37, 5 -; SI-NEXT: v_writelane_b32 v63, s38, 6 -; SI-NEXT: v_writelane_b32 v63, s39, 7 -; SI-NEXT: v_writelane_b32 v63, s48, 8 -; SI-NEXT: v_writelane_b32 v63, s49, 9 -; SI-NEXT: v_writelane_b32 v63, s50, 10 -; SI-NEXT: v_writelane_b32 v63, s51, 11 -; SI-NEXT: v_writelane_b32 v63, s52, 12 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v36, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_writelane_b32 v63, s53, 13 -; SI-NEXT: v_writelane_b32 v63, s54, 14 -; SI-NEXT: v_writelane_b32 v63, s55, 15 -; SI-NEXT: v_writelane_b32 v63, s64, 16 -; SI-NEXT: v_writelane_b32 v63, s65, 17 -; SI-NEXT: v_writelane_b32 v63, s66, 18 -; SI-NEXT: v_writelane_b32 v63, s67, 19 -; SI-NEXT: v_writelane_b32 v63, s68, 20 -; SI-NEXT: v_writelane_b32 v63, s69, 21 -; SI-NEXT: v_writelane_b32 v63, s70, 22 -; SI-NEXT: v_writelane_b32 v63, s71, 23 -; SI-NEXT: v_writelane_b32 v63, s80, 24 -; SI-NEXT: v_writelane_b32 v63, s81, 25 -; SI-NEXT: v_writelane_b32 v63, s82, 26 -; SI-NEXT: v_writelane_b32 v63, s83, 27 -; SI-NEXT: v_writelane_b32 v63, s84, 28 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s27 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s18 -; SI-NEXT: v_writelane_b32 v63, s85, 29 -; SI-NEXT: v_writelane_b32 v63, s86, 30 -; SI-NEXT: v_writelane_b32 v63, s87, 31 -; SI-NEXT: v_writelane_b32 v63, s96, 32 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 -; SI-NEXT: v_writelane_b32 v63, s97, 33 -; SI-NEXT: v_writelane_b32 v63, s98, 34 -; SI-NEXT: v_writelane_b32 v63, s99, 35 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v43, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v53, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v47, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_cbranch_scc0 .LBB95_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_readfirstlane_b32 s4, v25 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v61 -; SI-NEXT: s_or_b32 s44, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v28 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v19 -; SI-NEXT: s_or_b32 s45, s5, s4 -; SI-NEXT: s_lshr_b64 s[4:5], s[44:45], 24 -; SI-NEXT: v_writelane_b32 v62, s4, 4 -; SI-NEXT: v_writelane_b32 v62, s5, 5 -; SI-NEXT: s_lshr_b64 s[4:5], s[44:45], 16 -; SI-NEXT: v_writelane_b32 v62, s4, 2 -; SI-NEXT: v_writelane_b32 v62, s5, 3 -; SI-NEXT: s_lshr_b64 s[4:5], s[44:45], 8 -; SI-NEXT: v_writelane_b32 v62, s4, 0 -; SI-NEXT: v_writelane_b32 v62, s5, 1 -; SI-NEXT: v_readfirstlane_b32 s4, v60 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v21 -; SI-NEXT: s_or_b32 s42, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v44 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v0 -; SI-NEXT: s_or_b32 s43, s5, s4 -; SI-NEXT: s_lshr_b64 s[4:5], s[42:43], 24 -; SI-NEXT: v_writelane_b32 v62, s4, 10 -; SI-NEXT: v_writelane_b32 v62, s5, 11 -; SI-NEXT: s_lshr_b64 s[4:5], s[42:43], 16 -; SI-NEXT: v_writelane_b32 v62, s4, 8 -; SI-NEXT: v_writelane_b32 v62, s5, 9 -; SI-NEXT: s_lshr_b64 s[4:5], s[42:43], 8 -; SI-NEXT: v_writelane_b32 v62, s4, 6 -; SI-NEXT: v_writelane_b32 v62, s5, 7 -; SI-NEXT: v_readfirstlane_b32 s4, v37 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v41 -; SI-NEXT: s_or_b32 s40, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v54 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v16 -; SI-NEXT: s_or_b32 s41, s5, s4 -; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 24 -; SI-NEXT: v_writelane_b32 v62, s4, 16 -; SI-NEXT: v_writelane_b32 v62, s5, 17 -; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 16 -; SI-NEXT: v_writelane_b32 v62, s4, 14 -; SI-NEXT: v_writelane_b32 v62, s5, 15 -; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 8 -; SI-NEXT: v_writelane_b32 v62, s4, 12 -; SI-NEXT: v_writelane_b32 v62, s5, 13 -; SI-NEXT: v_readfirstlane_b32 s4, v38 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v48 -; SI-NEXT: s_or_b32 s28, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v52 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v22 -; SI-NEXT: s_or_b32 s29, s5, s4 -; SI-NEXT: s_lshr_b64 s[4:5], s[28:29], 24 -; SI-NEXT: v_writelane_b32 v62, s4, 22 -; SI-NEXT: v_writelane_b32 v62, s5, 23 -; SI-NEXT: s_lshr_b64 s[4:5], s[28:29], 16 -; SI-NEXT: v_writelane_b32 v62, s4, 20 -; SI-NEXT: v_writelane_b32 v62, s5, 21 -; SI-NEXT: s_lshr_b64 s[4:5], s[28:29], 8 -; SI-NEXT: v_writelane_b32 v62, s4, 18 -; SI-NEXT: v_writelane_b32 v62, s5, 19 -; SI-NEXT: v_readfirstlane_b32 s4, v14 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v49 -; SI-NEXT: s_or_b32 s26, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v45 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v13 -; SI-NEXT: s_or_b32 s27, s5, s4 -; SI-NEXT: s_lshr_b64 s[4:5], s[26:27], 24 -; SI-NEXT: v_writelane_b32 v62, s4, 28 -; SI-NEXT: v_writelane_b32 v62, s5, 29 -; SI-NEXT: s_lshr_b64 s[4:5], s[26:27], 16 -; SI-NEXT: v_writelane_b32 v62, s4, 26 -; SI-NEXT: v_writelane_b32 v62, s5, 27 -; SI-NEXT: s_lshr_b64 s[4:5], s[26:27], 8 -; SI-NEXT: v_writelane_b32 v62, s4, 24 -; SI-NEXT: v_writelane_b32 v62, s5, 25 -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v40 -; SI-NEXT: s_or_b32 s24, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v12 -; SI-NEXT: v_mov_b32_e32 v13, v9 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v13 -; SI-NEXT: s_or_b32 s25, s5, s4 -; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 24 -; SI-NEXT: v_writelane_b32 v62, s4, 34 -; SI-NEXT: v_writelane_b32 v62, s5, 35 -; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 16 -; SI-NEXT: v_writelane_b32 v62, s4, 32 -; SI-NEXT: v_writelane_b32 v62, s5, 33 -; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 8 -; SI-NEXT: v_writelane_b32 v62, s4, 30 -; SI-NEXT: v_writelane_b32 v62, s5, 31 -; SI-NEXT: v_readfirstlane_b32 s4, v56 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v23 -; SI-NEXT: s_or_b32 s22, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v11 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v50 -; SI-NEXT: s_or_b32 s23, s5, s4 -; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 24 -; SI-NEXT: v_writelane_b32 v62, s4, 40 -; SI-NEXT: v_writelane_b32 v62, s5, 41 -; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 16 -; SI-NEXT: v_writelane_b32 v62, s4, 38 -; SI-NEXT: v_writelane_b32 v62, s5, 39 -; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 8 -; SI-NEXT: v_writelane_b32 v62, s4, 36 -; SI-NEXT: v_writelane_b32 v62, s5, 37 -; SI-NEXT: v_readfirstlane_b32 s4, v58 -; SI-NEXT: v_mov_b32_e32 v9, v51 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v9 -; SI-NEXT: s_or_b32 s20, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v10 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v29 -; SI-NEXT: s_or_b32 s21, s5, s4 -; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 24 -; SI-NEXT: v_writelane_b32 v62, s4, 44 -; SI-NEXT: v_writelane_b32 v62, s5, 45 -; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 16 -; SI-NEXT: v_writelane_b32 v62, s4, 42 -; SI-NEXT: v_writelane_b32 v62, s5, 43 -; SI-NEXT: v_readfirstlane_b32 s4, v20 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v35 -; SI-NEXT: s_or_b32 s18, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v8 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v31 -; SI-NEXT: s_or_b32 s19, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v17 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v26 -; SI-NEXT: s_or_b32 s16, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v36 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v32 -; SI-NEXT: s_or_b32 s17, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v4 -; SI-NEXT: v_mov_b32_e32 v37, v18 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v37 -; SI-NEXT: s_or_b32 s14, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v7 -; SI-NEXT: v_mov_b32_e32 v51, v15 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v51 -; SI-NEXT: s_or_b32 s15, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v34 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v33 -; SI-NEXT: s_or_b32 s12, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v43 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v42 -; SI-NEXT: s_or_b32 s13, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v6 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v39 -; SI-NEXT: s_or_b32 s10, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v3 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v55 -; SI-NEXT: s_or_b32 s11, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v57 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v30 -; SI-NEXT: s_or_b32 s8, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v5 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v59 -; SI-NEXT: s_or_b32 s9, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v46 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v27 -; SI-NEXT: s_or_b32 s6, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v53 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v47 -; SI-NEXT: s_or_b32 s7, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s46, v1 -; SI-NEXT: v_mov_b32_e32 v38, v48 -; SI-NEXT: v_mov_b32_e32 v2, v40 -; SI-NEXT: v_mov_b32_e32 v56, v23 -; SI-NEXT: v_mov_b32_e32 v34, v33 -; SI-NEXT: v_mov_b32_e32 v6, v39 -; SI-NEXT: s_lshr_b32 s71, s45, 8 -; SI-NEXT: s_lshr_b32 s70, s43, 8 -; SI-NEXT: s_lshr_b32 s69, s41, 8 -; SI-NEXT: s_lshr_b32 s68, s29, 8 -; SI-NEXT: s_lshr_b32 s66, s27, 8 -; SI-NEXT: s_lshr_b32 s64, s25, 8 -; SI-NEXT: s_lshr_b32 s54, s23, 8 -; SI-NEXT: s_lshr_b32 s52, s21, 8 -; SI-NEXT: s_lshr_b32 s50, s19, 8 -; SI-NEXT: s_lshr_b32 s48, s17, 8 -; SI-NEXT: s_lshr_b32 s67, s15, 8 -; SI-NEXT: s_lshr_b32 s65, s13, 8 -; SI-NEXT: s_lshr_b32 s55, s11, 8 -; SI-NEXT: s_lshr_b32 s53, s9, 8 -; SI-NEXT: s_lshr_b32 s51, s7, 8 -; SI-NEXT: s_lshr_b64 s[62:63], s[20:21], 8 -; SI-NEXT: s_lshr_b64 s[96:97], s[18:19], 24 -; SI-NEXT: s_lshr_b64 s[98:99], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[58:59], s[18:19], 8 -; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 24 -; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 16 -; SI-NEXT: s_lshr_b64 s[38:39], s[16:17], 8 -; SI-NEXT: s_lshr_b64 s[92:93], s[14:15], 24 -; SI-NEXT: s_lshr_b64 s[94:95], s[14:15], 16 -; SI-NEXT: s_lshr_b64 s[30:31], s[14:15], 8 -; SI-NEXT: s_lshr_b64 s[78:79], s[12:13], 24 -; SI-NEXT: s_lshr_b64 s[88:89], s[12:13], 16 -; SI-NEXT: s_lshr_b64 s[90:91], s[12:13], 8 -; SI-NEXT: s_lshr_b64 s[72:73], s[10:11], 24 -; SI-NEXT: s_lshr_b64 s[74:75], s[10:11], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[10:11], 8 -; SI-NEXT: s_lshr_b64 s[56:57], s[8:9], 16 -; SI-NEXT: s_lshr_b64 s[60:61], s[8:9], 8 -; SI-NEXT: s_lshr_b64 s[82:83], s[6:7], 24 -; SI-NEXT: s_lshr_b64 s[84:85], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[86:87], s[6:7], 8 -; SI-NEXT: v_mov_b32_e32 v47, v28 -; SI-NEXT: v_bfe_u32 v35, v28, 8, 8 -; SI-NEXT: v_mov_b32_e32 v50, v44 -; SI-NEXT: v_bfe_u32 v32, v44, 8, 8 -; SI-NEXT: v_mov_b32_e32 v44, v54 -; SI-NEXT: v_bfe_u32 v31, v54, 8, 8 -; SI-NEXT: v_mov_b32_e32 v54, v52 -; SI-NEXT: v_bfe_u32 v29, v52, 8, 8 -; SI-NEXT: v_mov_b32_e32 v52, v45 -; SI-NEXT: v_bfe_u32 v49, v45, 8, 8 -; SI-NEXT: v_bfe_u32 v25, v12, 8, 8 -; SI-NEXT: v_mov_b32_e32 v12, v11 -; SI-NEXT: v_bfe_u32 v21, v11, 8, 8 -; SI-NEXT: v_bfe_u32 v16, v10, 8, 8 -; SI-NEXT: v_mov_b32_e32 v10, v8 -; SI-NEXT: v_bfe_u32 v0, v8, 8, 8 -; SI-NEXT: v_mov_b32_e32 v45, v36 -; SI-NEXT: v_bfe_u32 v19, v36, 8, 8 -; SI-NEXT: v_mov_b32_e32 v36, v7 -; SI-NEXT: v_bfe_u32 v40, v7, 8, 8 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_readfirstlane_b32 s4, v57 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_readfirstlane_b32 s5, v15 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s5, v14 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_or_b32 s5, s46, s5 -; SI-NEXT: s_lshr_b64 vcc, s[4:5], 24 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 48 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 49 -; SI-NEXT: s_lshr_b64 vcc, s[4:5], 8 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 46 -; SI-NEXT: s_lshr_b32 s49, s5, 8 -; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 24 -; SI-NEXT: s_lshr_b64 s[80:81], s[4:5], 16 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 47 -; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: v_mov_b32_e32 v7, v43 -; SI-NEXT: v_bfe_u32 v61, v43, 8, 8 -; SI-NEXT: v_mov_b32_e32 v43, v3 -; SI-NEXT: v_bfe_u32 v60, v3, 8, 8 -; SI-NEXT: v_bfe_u32 v41, v5, 8, 8 -; SI-NEXT: v_bfe_u32 v48, v53, 8, 8 -; SI-NEXT: v_bfe_u32 v42, v14, 8, 8 -; SI-NEXT: v_mov_b32_e32 v4, v22 -; SI-NEXT: v_mov_b32_e32 v14, v13 -; SI-NEXT: v_mov_b32_e32 v20, v53 -; SI-NEXT: v_mov_b32_e32 v53, v5 -; SI-NEXT: v_mov_b32_e32 v39, v9 -; SI-NEXT: v_mov_b32_e32 v18, v59 -; SI-NEXT: s_branch .LBB95_3 -; SI-NEXT: .LBB95_2: -; SI-NEXT: v_mov_b32_e32 v6, v39 -; SI-NEXT: v_mov_b32_e32 v39, v51 -; SI-NEXT: v_mov_b32_e32 v51, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr80 -; SI-NEXT: v_mov_b32_e32 v34, v33 -; SI-NEXT: v_writelane_b32 v62, s4, 0 -; SI-NEXT: v_writelane_b32 v62, s5, 1 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v47, v28 -; SI-NEXT: v_writelane_b32 v62, s4, 2 -; SI-NEXT: v_writelane_b32 v62, s5, 3 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v56, v23 -; SI-NEXT: v_writelane_b32 v62, s4, 4 -; SI-NEXT: v_writelane_b32 v62, s5, 5 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v50, v44 -; SI-NEXT: v_writelane_b32 v62, s4, 6 -; SI-NEXT: v_writelane_b32 v62, s5, 7 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v44, v54 -; SI-NEXT: v_writelane_b32 v62, s4, 8 -; SI-NEXT: v_writelane_b32 v62, s5, 9 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v54, v52 -; SI-NEXT: v_writelane_b32 v62, s4, 10 -; SI-NEXT: v_writelane_b32 v62, s5, 11 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v52, v45 -; SI-NEXT: v_writelane_b32 v62, s4, 12 -; SI-NEXT: v_writelane_b32 v62, s5, 13 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v12, v11 -; SI-NEXT: v_writelane_b32 v62, s4, 14 -; SI-NEXT: v_writelane_b32 v62, s5, 15 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v10, v8 -; SI-NEXT: v_writelane_b32 v62, s4, 16 -; SI-NEXT: v_writelane_b32 v62, s5, 17 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v45, v36 -; SI-NEXT: v_writelane_b32 v62, s4, 18 -; SI-NEXT: v_writelane_b32 v62, s5, 19 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v36, v7 -; SI-NEXT: v_writelane_b32 v62, s4, 20 -; SI-NEXT: v_writelane_b32 v62, s5, 21 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v7, v43 -; SI-NEXT: v_writelane_b32 v62, s4, 22 -; SI-NEXT: v_writelane_b32 v62, s5, 23 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v43, v3 -; SI-NEXT: v_writelane_b32 v62, s4, 24 -; SI-NEXT: v_writelane_b32 v62, s5, 25 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v2, v40 -; SI-NEXT: v_writelane_b32 v62, s4, 26 -; SI-NEXT: v_writelane_b32 v62, s5, 27 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v38, v48 -; SI-NEXT: v_writelane_b32 v62, s4, 28 -; SI-NEXT: v_writelane_b32 v62, s5, 29 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: s_mov_b64 vcc, -1 -; SI-NEXT: v_writelane_b32 v62, s4, 30 -; SI-NEXT: v_writelane_b32 v62, s5, 31 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v4, v22 -; SI-NEXT: v_writelane_b32 v62, s4, 32 -; SI-NEXT: v_writelane_b32 v62, s5, 33 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v14, v9 -; SI-NEXT: v_writelane_b32 v62, s4, 34 -; SI-NEXT: v_writelane_b32 v62, s5, 35 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v20, v53 -; SI-NEXT: v_writelane_b32 v62, s4, 36 -; SI-NEXT: v_writelane_b32 v62, s5, 37 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v53, v5 -; SI-NEXT: v_writelane_b32 v62, s4, 38 -; SI-NEXT: v_writelane_b32 v62, s5, 39 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v37, v18 -; SI-NEXT: v_writelane_b32 v62, s4, 40 -; SI-NEXT: v_writelane_b32 v62, s5, 41 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v18, v59 -; SI-NEXT: v_writelane_b32 v62, s4, 42 -; SI-NEXT: v_writelane_b32 v62, s5, 43 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $sgpr71 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr70 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr69 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $sgpr68 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr66 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $sgpr24 -; SI-NEXT: ; implicit-def: $sgpr64 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $sgpr22 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $sgpr20 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr52 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $sgpr18 -; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr98 -; SI-NEXT: ; implicit-def: $sgpr96 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $sgpr67 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $sgpr65 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $sgpr55 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $sgpr53 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $sgpr51 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $sgpr49 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $sgpr16 -; SI-NEXT: ; implicit-def: $sgpr38 -; SI-NEXT: ; implicit-def: $sgpr36 -; SI-NEXT: ; implicit-def: $sgpr34 -; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr30 -; SI-NEXT: ; implicit-def: $sgpr94 -; SI-NEXT: ; implicit-def: $sgpr92 -; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr90 -; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: ; implicit-def: $sgpr78 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr86 -; SI-NEXT: ; implicit-def: $sgpr84 -; SI-NEXT: ; implicit-def: $sgpr82 -; SI-NEXT: v_writelane_b32 v62, s4, 44 -; SI-NEXT: v_writelane_b32 v62, s5, 45 -; SI-NEXT: v_writelane_b32 v62, s80, 46 -; SI-NEXT: v_writelane_b32 v62, s81, 47 -; SI-NEXT: ; implicit-def: $sgpr80 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v62, s80, 48 -; SI-NEXT: v_writelane_b32 v62, s81, 49 -; SI-NEXT: ; implicit-def: $sgpr80 -; SI-NEXT: .LBB95_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v13, v38 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v9, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_andn2_b64 vcc, exec, vcc -; SI-NEXT: v_mov_b32_e32 v5, v7 -; SI-NEXT: v_mov_b32_e32 v7, v10 -; SI-NEXT: v_mov_b32_e32 v8, v12 -; SI-NEXT: s_cbranch_vccnz .LBB95_5 -; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v57 -; SI-NEXT: v_mov_b32_e32 v16, v4 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s4, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_writelane_b32 v34, s30, 0 +; SI-NEXT: v_writelane_b32 v34, s31, 1 +; SI-NEXT: v_writelane_b32 v34, s34, 2 +; SI-NEXT: v_writelane_b32 v34, s35, 3 +; SI-NEXT: v_writelane_b32 v34, s36, 4 +; SI-NEXT: v_writelane_b32 v34, s37, 5 +; SI-NEXT: v_writelane_b32 v34, s38, 6 +; SI-NEXT: v_writelane_b32 v34, s39, 7 +; SI-NEXT: v_writelane_b32 v34, s48, 8 +; SI-NEXT: v_writelane_b32 v34, s49, 9 +; SI-NEXT: v_writelane_b32 v34, s50, 10 +; SI-NEXT: v_writelane_b32 v34, s51, 11 +; SI-NEXT: v_writelane_b32 v34, s52, 12 +; SI-NEXT: v_writelane_b32 v34, s53, 13 +; SI-NEXT: v_writelane_b32 v34, s54, 14 +; SI-NEXT: v_writelane_b32 v34, s55, 15 +; SI-NEXT: v_writelane_b32 v34, s64, 16 +; SI-NEXT: v_writelane_b32 v34, s65, 17 +; SI-NEXT: v_writelane_b32 v34, s66, 18 +; SI-NEXT: v_writelane_b32 v34, s67, 19 +; SI-NEXT: v_writelane_b32 v34, s68, 20 +; SI-NEXT: v_writelane_b32 v34, s69, 21 +; SI-NEXT: v_writelane_b32 v34, s70, 22 +; SI-NEXT: s_lshr_b32 s6, s20, 16 +; SI-NEXT: ; implicit-def: $vgpr37 : SGPR spill to VGPR lane +; SI-NEXT: v_writelane_b32 v34, s71, 23 +; SI-NEXT: s_lshr_b32 s7, s22, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v37, s6, 0 +; SI-NEXT: v_writelane_b32 v34, s80, 24 +; SI-NEXT: s_lshr_b32 s8, s24, 16 +; SI-NEXT: v_writelane_b32 v37, s7, 2 +; SI-NEXT: v_writelane_b32 v34, s81, 25 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: v_writelane_b32 v37, s8, 4 +; SI-NEXT: v_writelane_b32 v34, s82, 26 +; SI-NEXT: s_lshr_b32 s10, s28, 16 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v1 +; SI-NEXT: v_writelane_b32 v37, s9, 6 +; SI-NEXT: v_writelane_b32 v34, s83, 27 +; SI-NEXT: v_readfirstlane_b32 s11, v20 +; SI-NEXT: v_writelane_b32 v37, s10, 8 +; SI-NEXT: v_writelane_b32 v34, s84, 28 +; SI-NEXT: v_readfirstlane_b32 s12, v3 +; SI-NEXT: v_writelane_b32 v37, s11, 10 +; SI-NEXT: v_writelane_b32 v34, s85, 29 +; SI-NEXT: v_readfirstlane_b32 s13, v8 +; SI-NEXT: v_writelane_b32 v37, s12, 11 +; SI-NEXT: v_writelane_b32 v34, s86, 30 +; SI-NEXT: v_readfirstlane_b32 s14, v7 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 +; SI-NEXT: v_readfirstlane_b32 s71, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_writelane_b32 v37, s13, 12 +; SI-NEXT: v_writelane_b32 v34, s87, 31 +; SI-NEXT: v_readfirstlane_b32 s15, v4 +; SI-NEXT: v_writelane_b32 v37, s14, 13 +; SI-NEXT: v_writelane_b32 v34, s96, 32 +; SI-NEXT: v_readfirstlane_b32 s93, v10 +; SI-NEXT: v_writelane_b32 v37, s15, 14 +; SI-NEXT: v_writelane_b32 v34, s97, 33 +; SI-NEXT: v_readfirstlane_b32 s34, v12 +; SI-NEXT: v_writelane_b32 v37, s93, 15 +; SI-NEXT: v_writelane_b32 v34, s98, 34 +; SI-NEXT: v_readfirstlane_b32 s38, v11 +; SI-NEXT: v_writelane_b32 v37, s34, 16 +; SI-NEXT: v_writelane_b32 v34, s99, 35 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_lshr_b32 s90, s29, 16 +; SI-NEXT: s_lshr_b32 s89, s27, 16 +; SI-NEXT: s_lshr_b32 s88, s25, 16 +; SI-NEXT: s_lshr_b32 s79, s23, 16 +; SI-NEXT: s_lshr_b32 s78, s21, 16 +; SI-NEXT: s_lshr_b32 s77, s19, 16 +; SI-NEXT: s_lshr_b32 s99, s18, 16 +; SI-NEXT: s_lshr_b32 s76, s17, 16 +; SI-NEXT: s_lshr_b32 s97, s16, 16 +; SI-NEXT: v_writelane_b32 v37, s38, 17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 vcc_lo, v7 +; SI-NEXT: v_writelane_b32 v37, s18, 18 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 19 +; SI-NEXT: v_readfirstlane_b32 s94, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_readfirstlane_b32 s64, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_readfirstlane_b32 s50, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_readfirstlane_b32 s53, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_readfirstlane_b32 s49, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_readfirstlane_b32 s37, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_readfirstlane_b32 s96, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_readfirstlane_b32 s51, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_readfirstlane_b32 s81, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_writelane_b32 v37, s21, 20 +; SI-NEXT: v_readfirstlane_b32 s35, v2 +; SI-NEXT: v_readfirstlane_b32 s39, v1 +; SI-NEXT: v_readfirstlane_b32 s95, v18 +; SI-NEXT: v_readfirstlane_b32 s68, v17 +; SI-NEXT: v_readfirstlane_b32 s66, v16 +; SI-NEXT: v_readfirstlane_b32 s67, v15 +; SI-NEXT: v_readfirstlane_b32 s55, v14 +; SI-NEXT: v_readfirstlane_b32 s65, v13 +; SI-NEXT: v_readfirstlane_b32 s52, v12 +; SI-NEXT: v_readfirstlane_b32 s54, v11 +; SI-NEXT: v_readfirstlane_b32 s48, v10 +; SI-NEXT: v_readfirstlane_b32 vcc_hi, v9 +; SI-NEXT: v_readfirstlane_b32 s36, v8 +; SI-NEXT: v_readfirstlane_b32 s31, v6 +; SI-NEXT: v_readfirstlane_b32 s98, v5 +; SI-NEXT: v_readfirstlane_b32 s92, v19 +; SI-NEXT: v_readfirstlane_b32 s91, v3 +; SI-NEXT: v_writelane_b32 v37, s20, 21 +; SI-NEXT: v_writelane_b32 v37, vcc_hi, 22 +; SI-NEXT: ; implicit-def: $vgpr36 : SGPR spill to VGPR lane +; SI-NEXT: ; implicit-def: $vgpr35 : SGPR spill to VGPR lane +; SI-NEXT: s_cbranch_scc0 .LBB95_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s97, 16 +; SI-NEXT: s_or_b32 s74, s4, s5 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s76, 16 +; SI-NEXT: s_or_b32 s75, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s99, 16 +; SI-NEXT: s_or_b32 s72, s4, s5 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s77, 16 +; SI-NEXT: s_or_b32 s73, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s60, s4, s5 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s78, 16 +; SI-NEXT: s_or_b32 s61, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s7, 16 +; SI-NEXT: s_or_b32 s56, s4, s5 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s79, 16 +; SI-NEXT: s_or_b32 s57, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s8, 16 +; SI-NEXT: s_or_b32 s62, s4, s5 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s88, 16 +; SI-NEXT: s_or_b32 s63, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_or_b32 s58, s4, s5 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s89, 16 +; SI-NEXT: s_or_b32 s59, s4, s5 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s10, 16 +; SI-NEXT: s_or_b32 s46, s4, s5 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s90, 16 +; SI-NEXT: s_or_b32 s47, s4, s5 +; SI-NEXT: s_and_b32 s4, s39, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: s_or_b32 s44, s4, s5 +; SI-NEXT: s_and_b32 s4, s35, 0xffff +; SI-NEXT: s_lshl_b32 s5, s91, 16 +; SI-NEXT: s_or_b32 s45, s4, s5 +; SI-NEXT: s_and_b32 s4, s12, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: s_or_b32 s42, s4, s5 +; SI-NEXT: s_and_b32 s4, s71, 0xffff +; SI-NEXT: s_lshl_b32 s5, s92, 16 +; SI-NEXT: s_or_b32 s43, s4, s5 +; SI-NEXT: s_and_b32 s4, s81, 0xffff +; SI-NEXT: s_lshl_b32 s5, s98, 16 +; SI-NEXT: s_or_b32 s40, s4, s5 +; SI-NEXT: s_and_b32 s4, s51, 0xffff +; SI-NEXT: s_lshl_b32 s5, s31, 16 +; SI-NEXT: s_or_b32 s41, s4, s5 +; SI-NEXT: s_and_b32 s4, s14, 0xffff +; SI-NEXT: s_lshl_b32 s5, vcc_lo, 16 +; SI-NEXT: s_or_b32 s14, s4, s5 +; SI-NEXT: s_and_b32 s4, s13, 0xffff +; SI-NEXT: s_lshl_b32 s5, s36, 16 +; SI-NEXT: s_or_b32 s15, s4, s5 +; SI-NEXT: s_and_b32 s4, s96, 0xffff +; SI-NEXT: s_lshl_b32 s5, vcc_hi, 16 +; SI-NEXT: s_or_b32 s12, s4, s5 +; SI-NEXT: s_and_b32 s4, s93, 0xffff +; SI-NEXT: s_lshl_b32 s5, s48, 16 +; SI-NEXT: s_or_b32 s13, s4, s5 +; SI-NEXT: s_and_b32 s4, s38, 0xffff +; SI-NEXT: s_lshl_b32 s5, s54, 16 +; SI-NEXT: s_or_b32 s10, s4, s5 +; SI-NEXT: s_and_b32 s4, s34, 0xffff +; SI-NEXT: s_lshl_b32 s5, s52, 16 +; SI-NEXT: s_or_b32 s11, s4, s5 +; SI-NEXT: s_and_b32 s4, s37, 0xffff +; SI-NEXT: s_lshl_b32 s5, s65, 16 +; SI-NEXT: s_or_b32 s8, s4, s5 +; SI-NEXT: s_and_b32 s4, s49, 0xffff +; SI-NEXT: s_lshl_b32 s5, s55, 16 +; SI-NEXT: s_lshr_b32 s93, s75, 8 +; SI-NEXT: s_or_b32 s9, s4, s5 +; SI-NEXT: s_and_b32 s4, s53, 0xffff +; SI-NEXT: s_lshl_b32 s5, s67, 16 +; SI-NEXT: v_writelane_b32 v35, s93, 1 +; SI-NEXT: s_lshr_b32 s93, s73, 8 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_and_b32 s4, s50, 0xffff +; SI-NEXT: s_lshl_b32 s5, s66, 16 +; SI-NEXT: v_writelane_b32 v35, s93, 0 +; SI-NEXT: s_lshr_b32 s93, s61, 8 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_and_b32 s4, s64, 0xffff +; SI-NEXT: s_lshl_b32 s5, s68, 16 +; SI-NEXT: v_writelane_b32 v36, s93, 63 +; SI-NEXT: s_lshr_b32 s93, s57, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s94, 0xffff +; SI-NEXT: s_lshl_b32 vcc_lo, s95, 16 +; SI-NEXT: v_writelane_b32 v36, s93, 62 +; SI-NEXT: s_lshr_b32 s93, s63, 8 +; SI-NEXT: s_or_b32 s5, s5, vcc_lo +; SI-NEXT: v_writelane_b32 v36, s93, 61 +; SI-NEXT: s_lshr_b32 vcc_lo, s59, 8 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 60 +; SI-NEXT: s_lshr_b32 vcc_lo, s47, 8 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 59 +; SI-NEXT: s_lshr_b32 vcc_lo, s45, 8 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 58 +; SI-NEXT: s_lshr_b32 vcc_lo, s43, 8 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 57 +; SI-NEXT: s_lshr_b32 vcc_lo, s41, 8 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 56 +; SI-NEXT: s_lshr_b32 vcc_lo, s15, 8 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 55 +; SI-NEXT: s_lshr_b32 vcc_lo, s13, 8 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 54 +; SI-NEXT: s_lshr_b32 vcc_lo, s11, 8 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 53 +; SI-NEXT: s_lshr_b32 vcc_lo, s9, 8 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 52 +; SI-NEXT: s_lshr_b32 vcc_lo, s7, 8 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 51 +; SI-NEXT: s_lshr_b32 vcc_lo, s5, 8 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 50 +; SI-NEXT: s_lshr_b64 vcc, s[74:75], 24 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 27 +; SI-NEXT: v_writelane_b32 v37, vcc_hi, 28 +; SI-NEXT: s_lshr_b64 vcc, s[74:75], 16 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 25 +; SI-NEXT: v_writelane_b32 v37, vcc_hi, 26 +; SI-NEXT: s_lshr_b64 vcc, s[74:75], 8 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 23 +; SI-NEXT: v_writelane_b32 v37, vcc_hi, 24 +; SI-NEXT: s_lshr_b64 vcc, s[72:73], 24 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 33 +; SI-NEXT: v_writelane_b32 v37, vcc_hi, 34 +; SI-NEXT: s_lshr_b64 vcc, s[72:73], 16 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 31 +; SI-NEXT: v_writelane_b32 v37, vcc_hi, 32 +; SI-NEXT: s_lshr_b64 vcc, s[72:73], 8 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 29 +; SI-NEXT: v_writelane_b32 v37, vcc_hi, 30 +; SI-NEXT: s_lshr_b64 vcc, s[60:61], 24 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 39 +; SI-NEXT: v_writelane_b32 v37, vcc_hi, 40 +; SI-NEXT: s_lshr_b64 vcc, s[60:61], 16 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 37 +; SI-NEXT: v_writelane_b32 v37, vcc_hi, 38 +; SI-NEXT: s_lshr_b64 vcc, s[60:61], 8 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 35 +; SI-NEXT: v_writelane_b32 v37, vcc_hi, 36 +; SI-NEXT: s_lshr_b64 vcc, s[56:57], 24 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 45 +; SI-NEXT: v_writelane_b32 v37, vcc_hi, 46 +; SI-NEXT: s_lshr_b64 vcc, s[56:57], 16 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 43 +; SI-NEXT: v_writelane_b32 v37, vcc_hi, 44 +; SI-NEXT: s_lshr_b64 vcc, s[56:57], 8 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 41 +; SI-NEXT: v_writelane_b32 v37, vcc_hi, 42 +; SI-NEXT: s_lshr_b64 vcc, s[62:63], 24 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 51 +; SI-NEXT: v_writelane_b32 v37, vcc_hi, 52 +; SI-NEXT: s_lshr_b64 vcc, s[62:63], 16 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 49 +; SI-NEXT: v_writelane_b32 v37, vcc_hi, 50 +; SI-NEXT: s_lshr_b64 vcc, s[62:63], 8 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 47 +; SI-NEXT: v_writelane_b32 v37, vcc_hi, 48 +; SI-NEXT: s_lshr_b64 vcc, s[58:59], 24 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 57 +; SI-NEXT: s_bfe_u32 s93, s78, 0x80008 +; SI-NEXT: v_writelane_b32 v37, vcc_hi, 58 +; SI-NEXT: s_lshr_b64 vcc, s[58:59], 16 +; SI-NEXT: v_writelane_b32 v36, s93, 49 +; SI-NEXT: s_bfe_u32 s93, s79, 0x80008 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 55 +; SI-NEXT: v_writelane_b32 v36, s93, 48 +; SI-NEXT: s_bfe_u32 s93, s88, 0x80008 +; SI-NEXT: v_writelane_b32 v37, vcc_hi, 56 +; SI-NEXT: s_lshr_b64 vcc, s[58:59], 8 +; SI-NEXT: v_writelane_b32 v36, s93, 47 +; SI-NEXT: s_bfe_u32 s93, s89, 0x80008 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 53 +; SI-NEXT: v_writelane_b32 v36, s93, 46 +; SI-NEXT: s_bfe_u32 s93, s90, 0x80008 +; SI-NEXT: v_writelane_b32 v37, vcc_hi, 54 +; SI-NEXT: s_lshr_b64 vcc, s[46:47], 24 +; SI-NEXT: v_writelane_b32 v36, s93, 45 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 63 +; SI-NEXT: v_writelane_b32 v36, vcc_hi, 0 +; SI-NEXT: s_lshr_b64 vcc, s[46:47], 16 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 61 +; SI-NEXT: v_writelane_b32 v37, vcc_hi, 62 +; SI-NEXT: s_lshr_b64 vcc, s[46:47], 8 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 59 +; SI-NEXT: v_writelane_b32 v37, vcc_hi, 60 +; SI-NEXT: s_lshr_b64 vcc, s[44:45], 24 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 5 +; SI-NEXT: v_writelane_b32 v36, vcc_hi, 6 +; SI-NEXT: s_mov_b32 vcc_lo, s97 +; SI-NEXT: s_mov_b32 vcc_hi, s96 +; SI-NEXT: s_lshr_b64 s[96:97], s[44:45], 16 +; SI-NEXT: v_writelane_b32 v36, s96, 3 +; SI-NEXT: v_writelane_b32 v36, s97, 4 +; SI-NEXT: s_mov_b32 s96, vcc_hi +; SI-NEXT: s_mov_b32 s97, vcc_lo +; SI-NEXT: s_mov_b32 vcc_lo, s99 +; SI-NEXT: s_mov_b32 vcc_hi, s98 +; SI-NEXT: s_lshr_b64 s[98:99], s[44:45], 8 +; SI-NEXT: v_writelane_b32 v36, s98, 1 +; SI-NEXT: v_writelane_b32 v36, s99, 2 +; SI-NEXT: s_mov_b32 s98, vcc_hi +; SI-NEXT: s_mov_b32 s99, vcc_lo +; SI-NEXT: s_lshr_b64 vcc, s[42:43], 24 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 11 +; SI-NEXT: v_writelane_b32 v36, vcc_hi, 12 +; SI-NEXT: s_lshr_b64 vcc, s[42:43], 16 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 9 +; SI-NEXT: v_writelane_b32 v36, vcc_hi, 10 +; SI-NEXT: s_lshr_b64 vcc, s[42:43], 8 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 7 +; SI-NEXT: v_writelane_b32 v36, vcc_hi, 8 +; SI-NEXT: s_lshr_b64 vcc, s[40:41], 24 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 17 +; SI-NEXT: v_writelane_b32 v36, vcc_hi, 18 +; SI-NEXT: s_lshr_b64 vcc, s[40:41], 16 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 15 +; SI-NEXT: v_writelane_b32 v36, vcc_hi, 16 +; SI-NEXT: s_lshr_b64 vcc, s[40:41], 8 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 13 +; SI-NEXT: v_writelane_b32 v36, vcc_hi, 14 +; SI-NEXT: s_lshr_b64 vcc, s[14:15], 24 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 23 +; SI-NEXT: v_writelane_b32 v36, vcc_hi, 24 +; SI-NEXT: s_lshr_b64 vcc, s[14:15], 16 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 21 +; SI-NEXT: v_writelane_b32 v36, vcc_hi, 22 +; SI-NEXT: s_lshr_b64 vcc, s[14:15], 8 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 19 +; SI-NEXT: v_writelane_b32 v36, vcc_hi, 20 +; SI-NEXT: s_lshr_b64 vcc, s[12:13], 24 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 29 +; SI-NEXT: v_writelane_b32 v36, vcc_hi, 30 +; SI-NEXT: s_lshr_b64 vcc, s[12:13], 16 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 27 +; SI-NEXT: v_writelane_b32 v36, vcc_hi, 28 +; SI-NEXT: s_lshr_b64 vcc, s[12:13], 8 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 25 +; SI-NEXT: v_writelane_b32 v36, vcc_hi, 26 +; SI-NEXT: s_lshr_b64 vcc, s[10:11], 24 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 35 +; SI-NEXT: v_writelane_b32 v36, vcc_hi, 36 +; SI-NEXT: s_lshr_b64 vcc, s[10:11], 16 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 33 +; SI-NEXT: v_writelane_b32 v36, vcc_hi, 34 +; SI-NEXT: s_lshr_b64 vcc, s[10:11], 8 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 31 +; SI-NEXT: v_writelane_b32 v36, vcc_hi, 32 +; SI-NEXT: s_lshr_b64 vcc, s[8:9], 24 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 41 +; SI-NEXT: v_writelane_b32 v36, vcc_hi, 42 +; SI-NEXT: s_lshr_b64 vcc, s[8:9], 16 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 39 +; SI-NEXT: v_writelane_b32 v36, vcc_hi, 40 +; SI-NEXT: s_lshr_b64 vcc, s[8:9], 8 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 37 +; SI-NEXT: s_mov_b32 s30, s53 +; SI-NEXT: s_mov_b32 s53, s50 +; SI-NEXT: v_writelane_b32 v36, vcc_hi, 38 +; SI-NEXT: s_mov_b32 vcc_lo, s51 +; SI-NEXT: s_lshr_b64 s[50:51], s[6:7], 24 +; SI-NEXT: s_mov_b32 s51, vcc_lo +; SI-NEXT: s_mov_b32 vcc_lo, s81 +; SI-NEXT: s_lshr_b64 s[80:81], s[6:7], 16 +; SI-NEXT: s_mov_b32 s81, vcc_lo +; SI-NEXT: s_lshr_b64 vcc, s[6:7], 8 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 43 +; SI-NEXT: v_writelane_b32 v36, vcc_hi, 44 +; SI-NEXT: s_mov_b32 vcc_lo, s35 +; SI-NEXT: s_lshr_b64 s[34:35], s[4:5], 24 +; SI-NEXT: s_mov_b32 s35, vcc_lo +; SI-NEXT: s_mov_b32 vcc_lo, s71 +; SI-NEXT: s_lshr_b64 s[70:71], s[4:5], 16 +; SI-NEXT: s_mov_b32 s71, vcc_lo +; SI-NEXT: s_mov_b32 vcc_lo, s39 +; SI-NEXT: s_lshr_b64 s[38:39], s[4:5], 8 +; SI-NEXT: s_mov_b32 s20, s37 +; SI-NEXT: s_mov_b32 s37, s49 +; SI-NEXT: s_bfe_u32 s18, s76, 0x80008 +; SI-NEXT: s_bfe_u32 s21, s77, 0x80008 +; SI-NEXT: s_bfe_u32 s93, s91, 0x80008 +; SI-NEXT: s_bfe_u32 s49, s92, 0x80008 +; SI-NEXT: s_bfe_u32 s82, s31, 0x80008 +; SI-NEXT: s_bfe_u32 s83, s36, 0x80008 +; SI-NEXT: s_bfe_u32 s84, s48, 0x80008 +; SI-NEXT: s_bfe_u32 s85, s52, 0x80008 +; SI-NEXT: s_bfe_u32 s69, s55, 0x80008 +; SI-NEXT: s_bfe_u32 s86, s66, 0x80008 +; SI-NEXT: s_bfe_u32 s87, s95, 0x80008 +; SI-NEXT: s_mov_b32 s39, vcc_lo +; SI-NEXT: s_cbranch_execnz .LBB95_4 +; SI-NEXT: .LBB95_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v2, s64 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s68 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s95 +; SI-NEXT: v_readlane_b32 s11, v37, 17 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_readfirstlane_b32 s5, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s94 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s67 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_readfirstlane_b32 s4, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s5, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s6, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s30 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_readfirstlane_b32 s5, v15 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s5, v1 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_bfe_u32 v42, v15, 8, 8 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s6, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_lshr_b64 vcc, s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[80:81], s[4:5], 16 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_lshr_b32 s49, s5, 8 -; SI-NEXT: v_readfirstlane_b32 s6, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 +; SI-NEXT: v_readfirstlane_b32 s6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s66 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v2 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s7, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s53 +; SI-NEXT: v_readfirstlane_b32 s7, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s65 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s7, v20 +; SI-NEXT: v_readfirstlane_b32 s7, v2 ; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: v_bfe_u32 v48, v20, 8, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s8, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_lshr_b64 s[82:83], s[6:7], 24 -; SI-NEXT: s_lshr_b64 s[84:85], s[6:7], 16 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_lshr_b64 s[86:87], s[6:7], 8 -; SI-NEXT: s_lshr_b32 s51, s7, 8 -; SI-NEXT: v_readfirstlane_b32 s8, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s55 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v3 ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s9, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s37 +; SI-NEXT: v_readfirstlane_b32 s9, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s54 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: v_readfirstlane_b32 s9, v53 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s10, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s11 +; SI-NEXT: v_readfirstlane_b32 s9, v3 ; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: v_bfe_u32 v41, v53, 8, 8 -; SI-NEXT: v_readfirstlane_b32 s10, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_lshr_b64 s[56:57], s[8:9], 16 -; SI-NEXT: s_lshr_b64 s[60:61], s[8:9], 8 -; SI-NEXT: s_lshr_b32 s53, s9, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s10, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 +; SI-NEXT: v_readfirstlane_b32 s10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s52 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v4 +; SI-NEXT: v_readlane_b32 s12, v37, 16 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s12 +; SI-NEXT: v_readlane_b32 s12, v37, 22 +; SI-NEXT: v_readfirstlane_b32 s11, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s12 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: s_lshl_b32 s10, s10, 16 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s11, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 ; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s11, v43 +; SI-NEXT: v_readfirstlane_b32 s11, v4 ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: v_bfe_u32 v60, v43, 8, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s12, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 +; SI-NEXT: v_readfirstlane_b32 s12, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s96 ; SI-NEXT: s_or_b32 s11, s12, s11 -; SI-NEXT: s_lshr_b64 s[72:73], s[10:11], 24 -; SI-NEXT: s_lshr_b64 s[74:75], s[10:11], 16 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_lshr_b64 s[76:77], s[10:11], 8 -; SI-NEXT: s_lshr_b32 s55, s11, 8 -; SI-NEXT: v_readfirstlane_b32 s12, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 +; SI-NEXT: v_readfirstlane_b32 s12, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s48 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_readlane_b32 s14, v37, 15 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s14 +; SI-NEXT: v_readlane_b32 s14, v37, 19 +; SI-NEXT: v_readfirstlane_b32 s13, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s14 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_readlane_b32 s15, v37, 13 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_readfirstlane_b32 s14, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s15 ; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s13, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 ; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_readfirstlane_b32 s13, v5 ; SI-NEXT: s_lshl_b32 s13, s13, 16 -; SI-NEXT: v_bfe_u32 v61, v5, 8, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s14, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_or_b32 s13, s14, s13 -; SI-NEXT: s_lshr_b64 s[78:79], s[12:13], 24 -; SI-NEXT: s_lshr_b64 s[88:89], s[12:13], 16 -; SI-NEXT: s_lshr_b64 s[90:91], s[12:13], 8 -; SI-NEXT: s_lshr_b32 s65, s13, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s14, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 +; SI-NEXT: v_readfirstlane_b32 s14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s36 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v6 +; SI-NEXT: v_readlane_b32 s40, v37, 12 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s40 +; SI-NEXT: v_readfirstlane_b32 s15, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s98 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: s_lshl_b32 s14, s14, 16 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s15, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 ; SI-NEXT: s_or_b32 s14, s15, s14 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_readfirstlane_b32 s15, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s15, v6 ; SI-NEXT: s_lshl_b32 s15, s15, 16 -; SI-NEXT: v_mov_b32_e32 v36, v1 -; SI-NEXT: v_bfe_u32 v40, v1, 8, 8 -; SI-NEXT: v_readfirstlane_b32 s16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 -; SI-NEXT: s_or_b32 s15, s16, s15 -; SI-NEXT: s_lshr_b64 s[92:93], s[14:15], 24 -; SI-NEXT: s_lshr_b64 s[94:95], s[14:15], 16 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_lshr_b64 s[30:31], s[14:15], 8 -; SI-NEXT: s_lshr_b32 s67, s15, 8 -; SI-NEXT: v_readfirstlane_b32 s16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s17, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s17, v6 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: v_mov_b32_e32 v45, v6 -; SI-NEXT: v_bfe_u32 v19, v6, 8, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s18, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 24 -; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 16 -; SI-NEXT: s_lshr_b64 s[38:39], s[16:17], 8 -; SI-NEXT: s_lshr_b32 s48, s17, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s18, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s19, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s19, v7 -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s20, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 -; SI-NEXT: s_or_b32 s19, s20, s19 -; SI-NEXT: s_lshr_b64 s[96:97], s[18:19], 24 -; SI-NEXT: s_lshr_b64 s[98:99], s[18:19], 16 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_lshr_b64 s[58:59], s[18:19], 8 -; SI-NEXT: s_lshr_b32 s50, s19, 8 -; SI-NEXT: v_readfirstlane_b32 s20, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: s_lshl_b32 s20, s20, 16 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s21, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_or_b32 s20, s21, s20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s21, v4 -; SI-NEXT: s_lshl_b32 s21, s21, 16 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s22, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_or_b32 s21, s22, s21 -; SI-NEXT: s_lshr_b64 s[62:63], s[20:21], 8 -; SI-NEXT: s_lshr_b32 s52, s21, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s22, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v56 -; SI-NEXT: s_lshl_b32 s22, s22, 16 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s23, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: s_or_b32 s22, s23, s22 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s23, v8 -; SI-NEXT: s_lshl_b32 s23, s23, 16 -; SI-NEXT: v_bfe_u32 v21, v8, 8, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s24, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: s_or_b32 s23, s24, s23 -; SI-NEXT: s_lshr_b32 s54, s23, 8 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s24, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 -; SI-NEXT: s_lshl_b32 s24, s24, 16 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s25, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_or_b32 s24, s25, s24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: v_readfirstlane_b32 s25, v2 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_lshl_b32 s25, s25, 16 -; SI-NEXT: v_bfe_u32 v25, v2, 8, 8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s26, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: s_or_b32 s25, s26, s25 -; SI-NEXT: s_lshr_b32 s64, s25, 8 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s26, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 -; SI-NEXT: s_lshl_b32 s26, s26, 16 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s27, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 -; SI-NEXT: s_or_b32 s26, s27, s26 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s27, v52 -; SI-NEXT: s_lshl_b32 s27, s27, 16 -; SI-NEXT: v_bfe_u32 v49, v52, 8, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s28, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 -; SI-NEXT: s_or_b32 s27, s28, s27 -; SI-NEXT: s_lshr_b32 s66, s27, 8 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s28, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 -; SI-NEXT: s_lshl_b32 s28, s28, 16 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s29, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v54 -; SI-NEXT: s_or_b32 s28, s29, s28 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 -; SI-NEXT: v_bfe_u32 v16, v4, 8, 8 -; SI-NEXT: v_readfirstlane_b32 s29, v54 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_lshl_b32 s29, s29, 16 -; SI-NEXT: v_bfe_u32 v29, v54, 8, 8 -; SI-NEXT: v_readfirstlane_b32 s40, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 -; SI-NEXT: s_or_b32 s29, s40, s29 -; SI-NEXT: s_lshr_b32 s68, s29, 8 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s40, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s40, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s81 +; SI-NEXT: s_or_b32 s15, s40, s15 +; SI-NEXT: v_readfirstlane_b32 s40, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s31 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v7 +; SI-NEXT: v_readlane_b32 s42, v37, 14 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s51 +; SI-NEXT: v_readfirstlane_b32 s41, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s42 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_readlane_b32 s43, v37, 11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_readfirstlane_b32 s42, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s43 ; SI-NEXT: s_lshl_b32 s40, s40, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s41, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v44 ; SI-NEXT: s_or_b32 s40, s41, s40 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s41, v44 +; SI-NEXT: v_readfirstlane_b32 s41, v7 ; SI-NEXT: s_lshl_b32 s41, s41, 16 -; SI-NEXT: v_bfe_u32 v31, v44, 8, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s42, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_or_b32 s41, s42, s41 -; SI-NEXT: s_lshr_b32 s69, s41, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s42, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s42, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s92 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v8 +; SI-NEXT: v_readlane_b32 s44, v37, 10 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s71 +; SI-NEXT: v_readfirstlane_b32 s43, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s44 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: s_lshl_b32 s42, s42, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s43, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 ; SI-NEXT: s_or_b32 s42, s43, s42 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s43, v50 +; SI-NEXT: v_readfirstlane_b32 s43, v8 ; SI-NEXT: s_lshl_b32 s43, s43, 16 -; SI-NEXT: v_bfe_u32 v32, v50, 8, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s44, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s44, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s39 ; SI-NEXT: s_or_b32 s43, s44, s43 -; SI-NEXT: s_lshr_b32 s70, s43, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s44, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s44, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s91 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v9 +; SI-NEXT: v_readlane_b32 s46, v37, 8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s35 +; SI-NEXT: v_readfirstlane_b32 s45, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s46 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: s_lshl_b32 s44, s44, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s45, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 +; SI-NEXT: v_readfirstlane_b32 s46, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s28 +; SI-NEXT: v_readfirstlane_b32 s28, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s90 ; SI-NEXT: s_or_b32 s44, s45, s44 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s45, v47 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 +; SI-NEXT: v_readfirstlane_b32 s45, v9 ; SI-NEXT: s_lshl_b32 s45, s45, 16 -; SI-NEXT: v_bfe_u32 v35, v47, 8, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s46, v0 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s29 +; SI-NEXT: v_readlane_b32 s29, v37, 6 ; SI-NEXT: s_or_b32 s45, s46, s45 -; SI-NEXT: s_lshr_b64 s[46:47], s[44:45], 24 -; SI-NEXT: v_writelane_b32 v62, s46, 4 -; SI-NEXT: v_writelane_b32 v62, s47, 5 -; SI-NEXT: s_lshr_b64 s[46:47], s[44:45], 16 -; SI-NEXT: v_writelane_b32 v62, s46, 2 -; SI-NEXT: v_writelane_b32 v62, s47, 3 -; SI-NEXT: s_lshr_b64 s[46:47], s[44:45], 8 -; SI-NEXT: v_writelane_b32 v62, s46, 0 -; SI-NEXT: v_writelane_b32 v62, s47, 1 -; SI-NEXT: s_lshr_b64 s[46:47], s[42:43], 24 -; SI-NEXT: v_writelane_b32 v62, s46, 10 -; SI-NEXT: v_writelane_b32 v62, s47, 11 -; SI-NEXT: s_lshr_b64 s[46:47], s[42:43], 16 -; SI-NEXT: v_writelane_b32 v62, s46, 8 -; SI-NEXT: v_writelane_b32 v62, s47, 9 -; SI-NEXT: s_lshr_b64 s[46:47], s[42:43], 8 -; SI-NEXT: v_writelane_b32 v62, s46, 6 -; SI-NEXT: v_writelane_b32 v62, s47, 7 -; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 24 -; SI-NEXT: v_writelane_b32 v62, s46, 16 -; SI-NEXT: v_writelane_b32 v62, s47, 17 -; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 16 -; SI-NEXT: v_writelane_b32 v62, s46, 14 -; SI-NEXT: v_writelane_b32 v62, s47, 15 -; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 8 -; SI-NEXT: v_writelane_b32 v62, s46, 12 -; SI-NEXT: v_writelane_b32 v62, s47, 13 -; SI-NEXT: s_lshr_b64 s[46:47], s[28:29], 24 -; SI-NEXT: v_writelane_b32 v62, s46, 22 -; SI-NEXT: v_writelane_b32 v62, s47, 23 -; SI-NEXT: s_lshr_b64 s[46:47], s[28:29], 16 -; SI-NEXT: v_writelane_b32 v62, s46, 20 -; SI-NEXT: v_writelane_b32 v62, s47, 21 -; SI-NEXT: s_lshr_b64 s[46:47], s[28:29], 8 -; SI-NEXT: v_writelane_b32 v62, s46, 18 -; SI-NEXT: v_writelane_b32 v62, s47, 19 -; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 24 -; SI-NEXT: v_writelane_b32 v62, s46, 28 -; SI-NEXT: v_writelane_b32 v62, s47, 29 -; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 16 -; SI-NEXT: v_writelane_b32 v62, s46, 26 -; SI-NEXT: v_writelane_b32 v62, s47, 27 -; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 8 -; SI-NEXT: v_writelane_b32 v62, s46, 24 -; SI-NEXT: v_writelane_b32 v62, s47, 25 -; SI-NEXT: s_lshr_b64 s[46:47], s[24:25], 24 -; SI-NEXT: v_writelane_b32 v62, s46, 34 -; SI-NEXT: v_writelane_b32 v62, s47, 35 -; SI-NEXT: s_lshr_b64 s[46:47], s[24:25], 16 -; SI-NEXT: v_writelane_b32 v62, s46, 32 -; SI-NEXT: v_writelane_b32 v62, s47, 33 -; SI-NEXT: s_lshr_b64 s[46:47], s[24:25], 8 -; SI-NEXT: v_writelane_b32 v62, s46, 30 -; SI-NEXT: v_writelane_b32 v62, s47, 31 -; SI-NEXT: s_lshr_b64 s[46:47], s[22:23], 24 -; SI-NEXT: v_writelane_b32 v62, s46, 40 -; SI-NEXT: v_writelane_b32 v62, s47, 41 -; SI-NEXT: s_lshr_b64 s[46:47], s[22:23], 16 -; SI-NEXT: v_writelane_b32 v62, s46, 38 -; SI-NEXT: v_writelane_b32 v62, s47, 39 -; SI-NEXT: s_lshr_b64 s[46:47], s[22:23], 8 -; SI-NEXT: v_writelane_b32 v62, s46, 36 -; SI-NEXT: v_writelane_b32 v62, s47, 37 -; SI-NEXT: s_lshr_b64 s[46:47], s[20:21], 24 -; SI-NEXT: v_writelane_b32 v62, s46, 44 -; SI-NEXT: v_writelane_b32 v62, s47, 45 -; SI-NEXT: s_lshr_b64 s[46:47], s[20:21], 16 -; SI-NEXT: v_writelane_b32 v62, s46, 42 -; SI-NEXT: v_writelane_b32 v62, s47, 43 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 48 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 49 -; SI-NEXT: s_lshr_b64 vcc, s[4:5], 8 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 46 -; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 24 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 47 -; SI-NEXT: s_lshr_b32 s71, s45, 8 -; SI-NEXT: v_bfe_u32 v0, v7, 8, 8 -; SI-NEXT: .LBB95_5: ; %end -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_readlane_b32 vcc_lo, v62, 0 -; SI-NEXT: v_readlane_b32 vcc_hi, v62, 1 -; SI-NEXT: s_lshl_b32 s47, vcc_lo, 8 -; SI-NEXT: v_readlane_b32 vcc_lo, v62, 2 -; SI-NEXT: s_and_b32 s44, s44, 0xff -; SI-NEXT: v_readlane_b32 vcc_hi, v62, 3 -; SI-NEXT: s_or_b32 s44, s44, s47 -; SI-NEXT: s_and_b32 s47, vcc_lo, 0xff -; SI-NEXT: v_readlane_b32 vcc_lo, v62, 4 -; SI-NEXT: s_lshl_b32 s57, vcc_lo, 24 -; SI-NEXT: s_lshl_b32 s47, s47, 16 -; SI-NEXT: s_or_b32 s47, s57, s47 -; SI-NEXT: s_and_b32 s44, s44, 0xffff -; SI-NEXT: s_or_b32 s44, s44, s47 -; SI-NEXT: v_mov_b32_e32 v13, s44 -; SI-NEXT: s_and_b32 s44, s45, 0xff -; SI-NEXT: s_lshl_b32 s45, s71, 8 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v47 -; SI-NEXT: s_or_b32 s44, s44, s45 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v35 -; SI-NEXT: s_and_b32 s44, s44, 0xffff -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_or_b32_e32 v14, s44, v14 -; SI-NEXT: v_readlane_b32 s44, v62, 6 -; SI-NEXT: v_readlane_b32 s45, v62, 7 -; SI-NEXT: s_lshl_b32 s44, s44, 8 -; SI-NEXT: s_and_b32 s42, s42, 0xff -; SI-NEXT: v_readlane_b32 vcc_hi, v62, 5 -; SI-NEXT: s_or_b32 s42, s42, s44 -; SI-NEXT: v_readlane_b32 s44, v62, 8 -; SI-NEXT: v_readlane_b32 s45, v62, 9 -; SI-NEXT: s_and_b32 s44, s44, 0xff -; SI-NEXT: v_readlane_b32 vcc_lo, v62, 10 -; SI-NEXT: s_lshl_b32 s45, vcc_lo, 24 -; SI-NEXT: s_lshl_b32 s44, s44, 16 -; SI-NEXT: s_or_b32 s44, s45, s44 -; SI-NEXT: s_and_b32 s42, s42, 0xffff -; SI-NEXT: s_or_b32 s42, s42, s44 -; SI-NEXT: v_mov_b32_e32 v15, s42 -; SI-NEXT: s_and_b32 s42, s43, 0xff -; SI-NEXT: s_lshl_b32 s43, s70, 8 -; SI-NEXT: v_and_b32_e32 v18, 0xff, v50 -; SI-NEXT: s_or_b32 s42, s42, s43 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mov_b32_e32 v17, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v32 -; SI-NEXT: s_and_b32 s42, s42, 0xffff -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_or_b32_e32 v18, s42, v18 -; SI-NEXT: v_readlane_b32 s42, v62, 12 -; SI-NEXT: v_readlane_b32 s43, v62, 13 -; SI-NEXT: s_lshl_b32 s42, s42, 8 -; SI-NEXT: s_and_b32 s40, s40, 0xff -; SI-NEXT: s_or_b32 s40, s40, s42 -; SI-NEXT: v_readlane_b32 s42, v62, 14 -; SI-NEXT: v_readlane_b32 s43, v62, 15 -; SI-NEXT: s_and_b32 s42, s42, 0xff -; SI-NEXT: v_readlane_b32 s44, v62, 16 -; SI-NEXT: s_lshl_b32 s43, s44, 24 -; SI-NEXT: s_lshl_b32 s42, s42, 16 -; SI-NEXT: s_or_b32 s42, s43, s42 -; SI-NEXT: s_and_b32 s40, s40, 0xffff -; SI-NEXT: s_or_b32 s40, s40, s42 -; SI-NEXT: v_mov_b32_e32 v20, s40 -; SI-NEXT: s_and_b32 s40, s41, 0xff -; SI-NEXT: s_lshl_b32 s41, s69, 8 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v44 -; SI-NEXT: s_or_b32 s40, s40, s41 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v31 -; SI-NEXT: s_and_b32 s40, s40, 0xffff -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_or_b32_e32 v22, s40, v22 -; SI-NEXT: v_readlane_b32 s40, v62, 18 -; SI-NEXT: v_readlane_b32 s41, v62, 19 -; SI-NEXT: s_lshl_b32 s40, s40, 8 -; SI-NEXT: s_and_b32 s28, s28, 0xff -; SI-NEXT: s_or_b32 s28, s28, s40 -; SI-NEXT: v_readlane_b32 s40, v62, 20 -; SI-NEXT: v_readlane_b32 s41, v62, 21 -; SI-NEXT: s_and_b32 s40, s40, 0xff -; SI-NEXT: v_readlane_b32 s42, v62, 22 -; SI-NEXT: s_lshl_b32 s41, s42, 24 -; SI-NEXT: s_lshl_b32 s40, s40, 16 -; SI-NEXT: s_or_b32 s40, s41, s40 -; SI-NEXT: s_and_b32 s28, s28, 0xffff -; SI-NEXT: s_or_b32 s28, s28, s40 -; SI-NEXT: v_mov_b32_e32 v23, s28 -; SI-NEXT: s_and_b32 s28, s29, 0xff -; SI-NEXT: s_lshl_b32 s29, s68, 8 -; SI-NEXT: v_and_b32_e32 v26, 0xff, v54 -; SI-NEXT: s_or_b32 s28, s28, s29 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v29 -; SI-NEXT: s_and_b32 s28, s28, 0xffff -; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: v_or_b32_e32 v26, s28, v26 -; SI-NEXT: v_readlane_b32 s28, v62, 24 -; SI-NEXT: v_readlane_b32 s29, v62, 25 -; SI-NEXT: s_lshl_b32 s28, s28, 8 -; SI-NEXT: s_and_b32 s26, s26, 0xff -; SI-NEXT: s_or_b32 s26, s26, s28 -; SI-NEXT: v_readlane_b32 s28, v62, 26 -; SI-NEXT: v_readlane_b32 s29, v62, 27 -; SI-NEXT: s_and_b32 s28, s28, 0xff -; SI-NEXT: v_readlane_b32 s40, v62, 28 -; SI-NEXT: s_lshl_b32 s29, s40, 24 +; SI-NEXT: v_readfirstlane_b32 s46, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s29 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_readfirstlane_b32 s29, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 ; SI-NEXT: s_lshl_b32 s28, s28, 16 -; SI-NEXT: s_or_b32 s28, s29, s28 -; SI-NEXT: s_and_b32 s26, s26, 0xffff -; SI-NEXT: s_or_b32 s26, s26, s28 -; SI-NEXT: v_mov_b32_e32 v28, s26 -; SI-NEXT: s_and_b32 s26, s27, 0xff -; SI-NEXT: s_lshl_b32 s27, s66, 8 -; SI-NEXT: v_and_b32_e32 v12, 0xff, v52 -; SI-NEXT: s_or_b32 s26, s26, s27 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v49 -; SI-NEXT: s_and_b32 s26, s26, 0xffff -; SI-NEXT: v_or_b32_e32 v12, v27, v12 -; SI-NEXT: v_or_b32_e32 v12, s26, v12 -; SI-NEXT: v_readlane_b32 s26, v62, 30 -; SI-NEXT: v_readlane_b32 s27, v62, 31 -; SI-NEXT: s_lshl_b32 s26, s26, 8 -; SI-NEXT: s_and_b32 s24, s24, 0xff -; SI-NEXT: s_or_b32 s24, s24, s26 -; SI-NEXT: v_readlane_b32 s26, v62, 32 -; SI-NEXT: v_readlane_b32 s27, v62, 33 -; SI-NEXT: s_and_b32 s26, s26, 0xff -; SI-NEXT: v_readlane_b32 s28, v62, 34 -; SI-NEXT: s_lshl_b32 s27, s28, 24 +; SI-NEXT: v_readfirstlane_b32 s26, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s89 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v11 +; SI-NEXT: s_or_b32 s46, s46, s28 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 +; SI-NEXT: v_readfirstlane_b32 s28, v10 +; SI-NEXT: s_lshl_b32 s28, s28, 16 +; SI-NEXT: v_readlane_b32 s27, v37, 4 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_or_b32 s47, s29, s28 +; SI-NEXT: v_readfirstlane_b32 s28, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 +; SI-NEXT: v_readfirstlane_b32 s27, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: s_lshl_b32 s26, s26, 16 -; SI-NEXT: v_readlane_b32 vcc_hi, v62, 11 -; SI-NEXT: s_or_b32 s26, s27, s26 -; SI-NEXT: s_and_b32 s24, s24, 0xffff -; SI-NEXT: s_or_b32 s24, s24, s26 -; SI-NEXT: buffer_store_dword v13, v24, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 4, v24 -; SI-NEXT: v_mov_b32_e32 v27, s24 -; SI-NEXT: s_and_b32 s24, s25, 0xff -; SI-NEXT: s_lshl_b32 s25, s64, 8 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v2 -; SI-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v13, vcc, 8, v24 -; SI-NEXT: s_or_b32 s24, s24, s25 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v25 -; SI-NEXT: buffer_store_dword v15, v13, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v13, vcc, 12, v24 -; SI-NEXT: s_and_b32 s24, s24, 0xffff -; SI-NEXT: v_or_b32_e32 v11, v25, v11 -; SI-NEXT: buffer_store_dword v18, v13, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v13, vcc, 16, v24 -; SI-NEXT: v_or_b32_e32 v11, s24, v11 -; SI-NEXT: buffer_store_dword v20, v13, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v13, vcc, 20, v24 -; SI-NEXT: v_readlane_b32 s24, v62, 36 -; SI-NEXT: buffer_store_dword v22, v13, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v13, vcc, 24, v24 -; SI-NEXT: v_readlane_b32 s25, v62, 37 -; SI-NEXT: s_lshl_b32 s24, s24, 8 -; SI-NEXT: s_and_b32 s22, s22, 0xff -; SI-NEXT: buffer_store_dword v23, v13, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v13, vcc, 28, v24 -; SI-NEXT: s_or_b32 s22, s22, s24 -; SI-NEXT: v_readlane_b32 s24, v62, 38 -; SI-NEXT: buffer_store_dword v26, v13, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v13, vcc, 32, v24 -; SI-NEXT: v_readlane_b32 s25, v62, 39 -; SI-NEXT: s_and_b32 s24, s24, 0xff -; SI-NEXT: v_readlane_b32 s26, v62, 40 -; SI-NEXT: buffer_store_dword v28, v13, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v13, vcc, 36, v24 +; SI-NEXT: s_or_b32 s58, s28, s26 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_readfirstlane_b32 s26, v11 +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 s59, s27, s26 +; SI-NEXT: v_readfirstlane_b32 s26, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_readlane_b32 s25, v37, 2 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_readfirstlane_b32 s24, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s88 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s25 +; SI-NEXT: v_readfirstlane_b32 s25, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_lshl_b32 s24, s24, 16 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_or_b32 s62, s26, s24 +; SI-NEXT: v_readfirstlane_b32 s24, v13 ; SI-NEXT: s_lshl_b32 s24, s24, 16 -; SI-NEXT: s_lshl_b32 s25, s26, 24 -; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_or_b32 s63, s25, s24 +; SI-NEXT: v_readfirstlane_b32 s24, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 +; SI-NEXT: v_readfirstlane_b32 s22, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s79 +; SI-NEXT: v_readlane_b32 s23, v37, 0 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_readlane_b32 s18, v37, 21 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_readfirstlane_b32 s23, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_or_b32 s56, s24, s22 +; SI-NEXT: v_readfirstlane_b32 s22, v16 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readlane_b32 s18, v37, 20 +; SI-NEXT: s_or_b32 s57, s23, s22 +; SI-NEXT: v_readfirstlane_b32 s22, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 +; SI-NEXT: v_readfirstlane_b32 s20, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s78 +; SI-NEXT: v_readlane_b32 s18, v37, 18 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s99 +; SI-NEXT: v_readfirstlane_b32 s21, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_or_b32 s60, s22, s20 +; SI-NEXT: v_readfirstlane_b32 s20, v20 +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: v_readfirstlane_b32 s18, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s77 +; SI-NEXT: s_or_b32 s61, s21, s20 +; SI-NEXT: v_readfirstlane_b32 s20, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s19 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v14 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_or_b32 s72, s20, s18 +; SI-NEXT: v_readfirstlane_b32 s18, v23 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s16 +; SI-NEXT: v_readfirstlane_b32 s16, v12 +; SI-NEXT: s_or_b32 s73, s16, s18 +; SI-NEXT: s_lshr_b64 s[18:19], s[72:73], 16 +; SI-NEXT: v_writelane_b32 v37, s18, 31 +; SI-NEXT: v_writelane_b32 v37, s19, 32 +; SI-NEXT: s_lshr_b64 s[18:19], s[46:47], 24 +; SI-NEXT: v_writelane_b32 v37, s18, 63 +; SI-NEXT: v_writelane_b32 v36, s19, 0 +; SI-NEXT: s_lshr_b64 s[18:19], s[46:47], 16 +; SI-NEXT: v_writelane_b32 v37, s18, 61 +; SI-NEXT: v_writelane_b32 v37, s19, 62 +; SI-NEXT: s_lshr_b64 s[18:19], s[46:47], 8 +; SI-NEXT: v_writelane_b32 v37, s18, 59 +; SI-NEXT: v_writelane_b32 v37, s19, 60 +; SI-NEXT: s_lshr_b64 s[18:19], s[14:15], 24 +; SI-NEXT: v_writelane_b32 v36, s18, 23 +; SI-NEXT: v_writelane_b32 v36, s19, 24 +; SI-NEXT: s_lshr_b64 s[18:19], s[12:13], 24 +; SI-NEXT: v_writelane_b32 v36, s18, 29 +; SI-NEXT: v_writelane_b32 v36, s19, 30 +; SI-NEXT: s_lshr_b64 s[18:19], s[12:13], 16 +; SI-NEXT: v_writelane_b32 v36, s18, 27 +; SI-NEXT: v_writelane_b32 v36, s19, 28 +; SI-NEXT: s_lshr_b64 s[18:19], s[12:13], 8 +; SI-NEXT: v_writelane_b32 v36, s18, 25 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s97 +; SI-NEXT: v_writelane_b32 v36, s19, 26 +; SI-NEXT: s_lshr_b64 s[70:71], s[10:11], 16 +; SI-NEXT: v_writelane_b32 v36, s70, 33 +; SI-NEXT: v_writelane_b32 v36, s71, 34 +; SI-NEXT: s_lshr_b64 s[70:71], s[10:11], 8 +; SI-NEXT: v_writelane_b32 v36, s70, 31 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_writelane_b32 v36, s71, 32 +; SI-NEXT: s_lshr_b64 s[70:71], s[8:9], 24 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_writelane_b32 v36, s70, 41 +; SI-NEXT: v_writelane_b32 v36, s71, 42 +; SI-NEXT: s_lshr_b64 s[70:71], s[8:9], 16 +; SI-NEXT: v_writelane_b32 v36, s70, 39 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v15 +; SI-NEXT: v_writelane_b32 v36, s71, 40 +; SI-NEXT: s_lshr_b64 s[70:71], s[8:9], 8 +; SI-NEXT: v_readfirstlane_b32 s16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s76 +; SI-NEXT: v_writelane_b32 v36, s70, 37 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s17 +; SI-NEXT: v_writelane_b32 v36, s71, 38 +; SI-NEXT: s_lshr_b64 s[70:71], s[6:7], 24 +; SI-NEXT: v_writelane_b32 v37, s70, 6 +; SI-NEXT: v_writelane_b32 v37, s71, 7 +; SI-NEXT: s_lshr_b64 s[70:71], s[6:7], 16 +; SI-NEXT: v_readfirstlane_b32 s17, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v14 +; SI-NEXT: v_writelane_b32 v37, s70, 8 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v15 +; SI-NEXT: v_writelane_b32 v37, s71, 9 +; SI-NEXT: s_lshr_b64 s[70:71], s[6:7], 8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_writelane_b32 v36, s70, 43 +; SI-NEXT: v_writelane_b32 v36, s71, 44 +; SI-NEXT: s_lshr_b64 s[70:71], s[4:5], 24 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: v_writelane_b32 v37, s70, 0 +; SI-NEXT: s_or_b32 s74, s17, s16 +; SI-NEXT: v_readfirstlane_b32 s16, v26 +; SI-NEXT: v_writelane_b32 v37, s71, 1 +; SI-NEXT: s_lshr_b64 s[70:71], s[4:5], 16 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: v_readfirstlane_b32 s17, v12 +; SI-NEXT: v_writelane_b32 v37, s70, 4 +; SI-NEXT: s_or_b32 s75, s17, s16 +; SI-NEXT: v_writelane_b32 v37, s71, 5 +; SI-NEXT: s_lshr_b64 s[70:71], s[4:5], 8 +; SI-NEXT: s_lshr_b64 vcc, s[74:75], 24 +; SI-NEXT: s_lshr_b64 s[38:39], s[74:75], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[72:73], 24 +; SI-NEXT: s_lshr_b64 s[34:35], s[72:73], 8 +; SI-NEXT: s_lshr_b64 s[76:77], s[60:61], 24 +; SI-NEXT: s_lshr_b64 s[78:79], s[60:61], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[60:61], 8 +; SI-NEXT: s_lshr_b64 s[24:25], s[56:57], 8 +; SI-NEXT: s_lshr_b64 s[50:51], s[62:63], 24 +; SI-NEXT: s_lshr_b64 s[20:21], s[58:59], 24 +; SI-NEXT: s_lshr_b64 s[92:93], s[58:59], 16 +; SI-NEXT: s_lshr_b64 s[36:37], s[58:59], 8 +; SI-NEXT: s_lshr_b64 s[18:19], s[10:11], 24 +; SI-NEXT: v_writelane_b32 v37, s70, 2 +; SI-NEXT: s_lshr_b64 s[16:17], s[74:75], 8 +; SI-NEXT: s_lshr_b64 s[82:83], s[56:57], 24 +; SI-NEXT: s_lshr_b64 s[86:87], s[56:57], 16 +; SI-NEXT: s_lshr_b64 s[52:53], s[62:63], 16 +; SI-NEXT: s_lshr_b64 s[68:69], s[62:63], 8 +; SI-NEXT: s_lshr_b32 s21, s75, 8 +; SI-NEXT: s_lshr_b32 s95, s73, 8 +; SI-NEXT: s_lshr_b32 s91, s61, 8 +; SI-NEXT: s_lshr_b32 s79, s57, 8 +; SI-NEXT: s_lshr_b32 s77, s63, 8 +; SI-NEXT: s_lshr_b32 vcc_hi, s59, 8 +; SI-NEXT: s_lshr_b32 s51, s47, 8 +; SI-NEXT: s_lshr_b32 s39, s45, 8 +; SI-NEXT: s_lshr_b32 s37, s43, 8 +; SI-NEXT: s_lshr_b32 s35, s41, 8 +; SI-NEXT: s_lshr_b32 s29, s15, 8 +; SI-NEXT: s_lshr_b32 s28, s13, 8 +; SI-NEXT: s_lshr_b32 s27, s11, 8 +; SI-NEXT: s_lshr_b32 s26, s9, 8 +; SI-NEXT: s_lshr_b32 s93, s7, 8 +; SI-NEXT: s_lshr_b32 s25, s5, 8 +; SI-NEXT: v_bfe_u32 v32, v26, 8, 8 +; SI-NEXT: v_bfe_u32 v31, v23, 8, 8 +; SI-NEXT: v_bfe_u32 v30, v20, 8, 8 +; SI-NEXT: v_bfe_u32 v29, v16, 8, 8 +; SI-NEXT: v_bfe_u32 v28, v13, 8, 8 +; SI-NEXT: v_bfe_u32 v27, v11, 8, 8 +; SI-NEXT: v_bfe_u32 v25, v10, 8, 8 +; SI-NEXT: v_bfe_u32 v24, v9, 8, 8 +; SI-NEXT: v_bfe_u32 v22, v8, 8, 8 +; SI-NEXT: v_bfe_u32 v21, v7, 8, 8 +; SI-NEXT: v_bfe_u32 v19, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v18, v5, 8, 8 +; SI-NEXT: v_bfe_u32 v17, v4, 8, 8 +; SI-NEXT: v_bfe_u32 v15, v3, 8, 8 +; SI-NEXT: v_bfe_u32 v14, v2, 8, 8 +; SI-NEXT: v_bfe_u32 v12, v1, 8, 8 +; SI-NEXT: s_lshr_b64 s[66:67], s[44:45], 24 +; SI-NEXT: s_lshr_b64 s[54:55], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[44:45], 8 +; SI-NEXT: s_lshr_b64 s[84:85], s[42:43], 24 +; SI-NEXT: s_lshr_b64 s[96:97], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[98:99], s[42:43], 8 +; SI-NEXT: s_lshr_b64 s[22:23], s[40:41], 24 +; SI-NEXT: s_lshr_b64 s[64:65], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[80:81], s[40:41], 8 +; SI-NEXT: s_lshr_b64 s[30:31], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[48:49], s[14:15], 8 +; SI-NEXT: v_writelane_b32 v37, s71, 3 +; SI-NEXT: s_mov_b32 s70, s18 +; SI-NEXT: s_branch .LBB95_5 +; SI-NEXT: .LBB95_3: +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: s_mov_b32 s30, s53 +; SI-NEXT: v_writelane_b32 v37, s4, 23 +; SI-NEXT: v_writelane_b32 v37, s5, 24 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: s_mov_b32 s53, s50 +; SI-NEXT: v_writelane_b32 v37, s4, 25 +; SI-NEXT: v_writelane_b32 v37, s5, 26 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: s_mov_b32 s20, s37 +; SI-NEXT: v_writelane_b32 v37, s4, 27 +; SI-NEXT: v_writelane_b32 v37, s5, 28 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: s_mov_b32 s37, s49 +; SI-NEXT: v_writelane_b32 v37, s4, 29 +; SI-NEXT: v_writelane_b32 v37, s5, 30 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v37, s4, 31 +; SI-NEXT: v_writelane_b32 v37, s5, 32 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr21 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr83 +; SI-NEXT: ; implicit-def: $sgpr84 +; SI-NEXT: ; implicit-def: $sgpr85 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr87 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: v_writelane_b32 v37, s4, 33 +; SI-NEXT: v_writelane_b32 v37, s5, 34 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v37, s4, 35 +; SI-NEXT: v_writelane_b32 v37, s5, 36 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v37, s4, 37 +; SI-NEXT: v_writelane_b32 v37, s5, 38 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v37, s4, 39 +; SI-NEXT: v_writelane_b32 v37, s5, 40 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v37, s4, 41 +; SI-NEXT: v_writelane_b32 v37, s5, 42 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v37, s4, 43 +; SI-NEXT: v_writelane_b32 v37, s5, 44 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v37, s4, 45 +; SI-NEXT: v_writelane_b32 v37, s5, 46 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v37, s4, 47 +; SI-NEXT: v_writelane_b32 v37, s5, 48 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v37, s4, 49 +; SI-NEXT: v_writelane_b32 v37, s5, 50 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v37, s4, 51 +; SI-NEXT: v_writelane_b32 v37, s5, 52 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v37, s4, 53 +; SI-NEXT: v_writelane_b32 v37, s5, 54 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v37, s4, 55 +; SI-NEXT: v_writelane_b32 v37, s5, 56 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v37, s4, 57 +; SI-NEXT: v_writelane_b32 v37, s5, 58 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v37, s4, 59 +; SI-NEXT: v_writelane_b32 v37, s5, 60 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v37, s4, 61 +; SI-NEXT: v_writelane_b32 v37, s5, 62 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v37, s4, 63 +; SI-NEXT: v_writelane_b32 v36, s5, 0 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 1 +; SI-NEXT: v_writelane_b32 v36, s5, 2 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 3 +; SI-NEXT: v_writelane_b32 v36, s5, 4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 5 +; SI-NEXT: v_writelane_b32 v36, s5, 6 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 7 +; SI-NEXT: v_writelane_b32 v36, s5, 8 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 9 +; SI-NEXT: v_writelane_b32 v36, s5, 10 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 11 +; SI-NEXT: v_writelane_b32 v36, s5, 12 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 13 +; SI-NEXT: v_writelane_b32 v36, s5, 14 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 15 +; SI-NEXT: v_writelane_b32 v36, s5, 16 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 17 +; SI-NEXT: v_writelane_b32 v36, s5, 18 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 19 +; SI-NEXT: v_writelane_b32 v36, s5, 20 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 21 +; SI-NEXT: v_writelane_b32 v36, s5, 22 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 23 +; SI-NEXT: v_writelane_b32 v36, s5, 24 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 25 +; SI-NEXT: v_writelane_b32 v36, s5, 26 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 27 +; SI-NEXT: v_writelane_b32 v36, s5, 28 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 29 +; SI-NEXT: v_writelane_b32 v36, s5, 30 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 31 +; SI-NEXT: v_writelane_b32 v36, s5, 32 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 33 +; SI-NEXT: v_writelane_b32 v36, s5, 34 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 35 +; SI-NEXT: v_writelane_b32 v36, s5, 36 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 37 +; SI-NEXT: v_writelane_b32 v36, s5, 38 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 39 +; SI-NEXT: v_writelane_b32 v36, s5, 40 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 41 +; SI-NEXT: v_writelane_b32 v36, s5, 42 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 43 +; SI-NEXT: v_writelane_b32 v36, s5, 44 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: s_branch .LBB95_2 +; SI-NEXT: .LBB95_4: +; SI-NEXT: v_mov_b32_e32 v16, s79 +; SI-NEXT: v_mov_b32_e32 v20, s78 +; SI-NEXT: v_mov_b32_e32 v23, s77 +; SI-NEXT: v_mov_b32_e32 v26, s76 +; SI-NEXT: v_readlane_b32 s76, v37, 39 +; SI-NEXT: v_readlane_b32 s78, v37, 37 +; SI-NEXT: v_readlane_b32 s77, v37, 40 +; SI-NEXT: v_readlane_b32 s79, v37, 38 +; SI-NEXT: v_writelane_b32 v37, s34, 0 +; SI-NEXT: v_writelane_b32 v37, s35, 1 +; SI-NEXT: v_mov_b32_e32 v9, s91 +; SI-NEXT: v_mov_b32_e32 v10, s90 +; SI-NEXT: v_readlane_b32 s16, v36, 45 +; SI-NEXT: v_readlane_b32 s90, v37, 33 +; SI-NEXT: v_mov_b32_e32 v25, s16 +; SI-NEXT: v_readlane_b32 s16, v36, 46 +; SI-NEXT: v_readlane_b32 s91, v37, 34 +; SI-NEXT: v_writelane_b32 v37, s70, 4 +; SI-NEXT: v_mov_b32_e32 v27, s16 +; SI-NEXT: v_readlane_b32 s16, v36, 47 +; SI-NEXT: v_writelane_b32 v37, s71, 5 +; SI-NEXT: v_mov_b32_e32 v1, s95 +; SI-NEXT: v_mov_b32_e32 v28, s16 +; SI-NEXT: v_readlane_b32 s16, v36, 48 +; SI-NEXT: v_readlane_b32 s94, v37, 35 +; SI-NEXT: v_mov_b32_e32 v29, s16 +; SI-NEXT: v_readlane_b32 s16, v36, 49 +; SI-NEXT: v_readlane_b32 s95, v37, 36 +; SI-NEXT: v_writelane_b32 v37, s38, 2 +; SI-NEXT: v_mov_b32_e32 v30, s16 +; SI-NEXT: v_readlane_b32 s16, v36, 60 +; SI-NEXT: v_writelane_b32 v37, s39, 3 +; SI-NEXT: s_mov_b32 vcc_hi, s16 +; SI-NEXT: v_readlane_b32 s16, v37, 27 +; SI-NEXT: v_readlane_b32 s34, v37, 29 +; SI-NEXT: v_readlane_b32 s17, v37, 28 +; SI-NEXT: v_readlane_b32 s35, v37, 30 +; SI-NEXT: v_writelane_b32 v37, s50, 6 +; SI-NEXT: v_writelane_b32 v37, s51, 7 +; SI-NEXT: v_readlane_b32 s38, v37, 25 +; SI-NEXT: v_readlane_b32 s39, v37, 26 +; SI-NEXT: v_writelane_b32 v37, s80, 8 +; SI-NEXT: v_writelane_b32 v37, s81, 9 +; SI-NEXT: v_mov_b32_e32 v6, s36 +; SI-NEXT: v_mov_b32_e32 v31, s21 +; SI-NEXT: v_readlane_b32 s20, v36, 56 +; SI-NEXT: v_readlane_b32 s21, v36, 58 +; SI-NEXT: v_readlane_b32 s80, v36, 35 +; SI-NEXT: v_readlane_b32 s36, v37, 53 +; SI-NEXT: v_readlane_b32 s50, v37, 51 +; SI-NEXT: v_mov_b32_e32 v8, s92 +; SI-NEXT: v_mov_b32_e32 v24, s93 +; SI-NEXT: v_readlane_b32 s22, v36, 57 +; SI-NEXT: v_readlane_b32 s23, v36, 59 +; SI-NEXT: s_mov_b32 s35, s20 +; SI-NEXT: s_mov_b32 s39, s21 +; SI-NEXT: v_readlane_b32 s81, v36, 36 +; SI-NEXT: v_readlane_b32 s20, v37, 57 +; SI-NEXT: v_readlane_b32 s92, v37, 55 +; SI-NEXT: v_readlane_b32 s37, v37, 54 +; SI-NEXT: v_readlane_b32 s51, v37, 52 +; SI-NEXT: v_readlane_b32 s24, v37, 41 +; SI-NEXT: v_mov_b32_e32 v2, s66 +; SI-NEXT: v_mov_b32_e32 v3, s55 +; SI-NEXT: v_mov_b32_e32 v4, s52 +; SI-NEXT: v_mov_b32_e32 v5, s48 +; SI-NEXT: v_mov_b32_e32 v7, s31 +; SI-NEXT: v_mov_b32_e32 v11, s89 +; SI-NEXT: v_mov_b32_e32 v13, s88 +; SI-NEXT: v_mov_b32_e32 v12, s87 +; SI-NEXT: v_mov_b32_e32 v14, s86 +; SI-NEXT: v_mov_b32_e32 v15, s69 +; SI-NEXT: v_mov_b32_e32 v17, s85 +; SI-NEXT: v_mov_b32_e32 v18, s84 +; SI-NEXT: v_mov_b32_e32 v19, s83 +; SI-NEXT: v_mov_b32_e32 v21, s82 +; SI-NEXT: v_mov_b32_e32 v22, s49 +; SI-NEXT: v_mov_b32_e32 v32, s18 +; SI-NEXT: v_readlane_b32 s19, v36, 50 +; SI-NEXT: v_readlane_b32 s28, v36, 51 +; SI-NEXT: v_readlane_b32 s29, v36, 54 +; SI-NEXT: v_readlane_b32 s18, v36, 55 +; SI-NEXT: s_mov_b32 vcc_lo, s16 +; SI-NEXT: v_readlane_b32 s16, v37, 23 +; SI-NEXT: s_mov_b32 s70, s80 +; SI-NEXT: v_readlane_b32 s21, v37, 58 +; SI-NEXT: v_readlane_b32 s93, v37, 56 +; SI-NEXT: v_readlane_b32 s30, v36, 21 +; SI-NEXT: s_mov_b32 s37, s22 +; SI-NEXT: v_readlane_b32 s48, v36, 19 +; SI-NEXT: s_mov_b32 s51, s23 +; SI-NEXT: v_readlane_b32 s22, v36, 17 +; SI-NEXT: v_readlane_b32 s52, v37, 49 +; SI-NEXT: v_readlane_b32 s64, v36, 15 +; SI-NEXT: v_readlane_b32 s68, v37, 47 +; SI-NEXT: v_readlane_b32 s80, v36, 13 +; SI-NEXT: v_readlane_b32 s82, v37, 45 +; SI-NEXT: v_readlane_b32 s84, v36, 11 +; SI-NEXT: v_readlane_b32 s86, v37, 43 +; SI-NEXT: v_readlane_b32 s96, v36, 9 +; SI-NEXT: v_readlane_b32 s25, v37, 42 +; SI-NEXT: v_readlane_b32 s98, v36, 7 +; SI-NEXT: v_readlane_b32 s54, v36, 3 +; SI-NEXT: v_readlane_b32 s66, v36, 5 +; SI-NEXT: v_readlane_b32 s88, v36, 1 +; SI-NEXT: v_readlane_b32 s26, v36, 52 +; SI-NEXT: v_readlane_b32 s27, v36, 53 +; SI-NEXT: v_readlane_b32 s77, v36, 61 +; SI-NEXT: v_readlane_b32 s17, v37, 24 +; SI-NEXT: v_readlane_b32 s79, v36, 62 +; SI-NEXT: v_readlane_b32 s91, v36, 63 +; SI-NEXT: v_readlane_b32 s95, v35, 0 +; SI-NEXT: v_readlane_b32 s31, v36, 22 +; SI-NEXT: v_readlane_b32 s21, v35, 1 +; SI-NEXT: v_readlane_b32 s49, v36, 20 +; SI-NEXT: v_readlane_b32 s23, v36, 18 +; SI-NEXT: v_readlane_b32 s53, v37, 50 +; SI-NEXT: v_readlane_b32 s65, v36, 16 +; SI-NEXT: v_readlane_b32 s69, v37, 48 +; SI-NEXT: v_readlane_b32 s81, v36, 14 +; SI-NEXT: v_readlane_b32 s83, v37, 46 +; SI-NEXT: v_readlane_b32 s85, v36, 12 +; SI-NEXT: v_readlane_b32 s87, v37, 44 +; SI-NEXT: v_readlane_b32 s97, v36, 10 +; SI-NEXT: s_mov_b32 s25, s19 +; SI-NEXT: s_mov_b32 s93, s28 +; SI-NEXT: s_mov_b32 s28, s29 +; SI-NEXT: s_mov_b32 s29, s18 +; SI-NEXT: v_readlane_b32 s99, v36, 8 +; SI-NEXT: v_readlane_b32 s55, v36, 4 +; SI-NEXT: v_readlane_b32 s67, v36, 6 +; SI-NEXT: v_readlane_b32 s89, v36, 2 +; SI-NEXT: .LBB95_5: ; %end +; SI-NEXT: s_lshl_b32 s16, s16, 8 +; SI-NEXT: s_and_b32 s17, s74, 0xff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s38, 0xff +; SI-NEXT: s_lshl_b32 s18, vcc_lo, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v33, s16 +; SI-NEXT: s_and_b32 s16, s75, 0xff +; SI-NEXT: s_lshl_b32 s17, s21, 8 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v32 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: v_or_b32_e32 v26, v32, v26 +; SI-NEXT: v_or_b32_e32 v26, s16, v26 +; SI-NEXT: s_lshl_b32 s16, s34, 8 +; SI-NEXT: s_and_b32 s17, s72, 0xff +; SI-NEXT: v_readlane_b32 s18, v37, 31 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s18, 0xff +; SI-NEXT: s_lshl_b32 s18, s90, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v32, s16 +; SI-NEXT: s_and_b32 s16, s73, 0xff +; SI-NEXT: s_lshl_b32 s17, s95, 8 +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v31, 24, v31 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: v_or_b32_e32 v23, v31, v23 +; SI-NEXT: v_or_b32_e32 v23, s16, v23 +; SI-NEXT: s_lshl_b32 s16, s94, 8 +; SI-NEXT: s_and_b32 s17, s60, 0xff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s78, 0xff +; SI-NEXT: s_lshl_b32 s18, s76, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v31, s16 +; SI-NEXT: s_and_b32 s16, s61, 0xff +; SI-NEXT: s_lshl_b32 s17, s91, 8 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v30 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: v_or_b32_e32 v20, v30, v20 +; SI-NEXT: v_or_b32_e32 v20, s16, v20 +; SI-NEXT: s_lshl_b32 s16, s24, 8 +; SI-NEXT: s_and_b32 s17, s56, 0xff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s86, 0xff +; SI-NEXT: s_lshl_b32 s18, s82, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v30, s16 +; SI-NEXT: s_and_b32 s16, s57, 0xff +; SI-NEXT: s_lshl_b32 s17, s79, 8 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: v_or_b32_e32 v16, v29, v16 +; SI-NEXT: v_or_b32_e32 v16, s16, v16 +; SI-NEXT: s_lshl_b32 s16, s68, 8 +; SI-NEXT: s_and_b32 s17, s62, 0xff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s52, 0xff +; SI-NEXT: s_lshl_b32 s18, s50, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v29, s16 +; SI-NEXT: s_and_b32 s16, s63, 0xff +; SI-NEXT: s_lshl_b32 s17, s77, 8 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v28 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: v_or_b32_e32 v13, v28, v13 +; SI-NEXT: v_or_b32_e32 v13, s16, v13 +; SI-NEXT: s_lshl_b32 s16, s36, 8 +; SI-NEXT: s_and_b32 s17, s58, 0xff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s92, 0xff +; SI-NEXT: s_lshl_b32 s18, s20, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v27 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_lshl_b32 s17, vcc_hi, 8 +; SI-NEXT: v_or_b32_e32 v11, v27, v11 +; SI-NEXT: v_add_i32_e32 v27, vcc, 4, v0 +; SI-NEXT: v_mov_b32_e32 v28, s16 +; SI-NEXT: s_and_b32 s16, s59, 0xff +; SI-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v12, vcc, 40, v24 -; SI-NEXT: s_and_b32 s22, s22, 0xffff -; SI-NEXT: s_or_b32 s24, s25, s24 -; SI-NEXT: buffer_store_dword v27, v12, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v12, vcc, 44, v24 -; SI-NEXT: s_or_b32 s22, s22, s24 -; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v26, vcc, 8, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v32, v26, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v26, vcc, 12, v0 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: buffer_store_dword v23, v26, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 16, v0 +; SI-NEXT: v_readlane_b32 s19, v37, 32 +; SI-NEXT: v_or_b32_e32 v11, s16, v11 +; SI-NEXT: buffer_store_dword v31, v23, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v23, vcc, 20, v0 +; SI-NEXT: v_readlane_b32 s16, v37, 59 +; SI-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v20, vcc, 24, v0 +; SI-NEXT: v_readlane_b32 s17, v37, 60 +; SI-NEXT: v_readlane_b32 s18, v37, 61 +; SI-NEXT: buffer_store_dword v30, v20, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v20, vcc, 28, v0 +; SI-NEXT: s_lshl_b32 s16, s16, 8 +; SI-NEXT: s_and_b32 s17, s46, 0xff +; SI-NEXT: v_readlane_b32 s19, v37, 62 +; SI-NEXT: buffer_store_dword v16, v20, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v11, vcc, 48, v24 -; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: s_and_b32 s22, s23, 0xff -; SI-NEXT: s_lshl_b32 s23, s54, 8 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v8 -; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen -; SI-NEXT: s_or_b32 s22, s22, s23 +; SI-NEXT: v_add_i32_e32 v16, vcc, 32, v0 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s18, 0xff +; SI-NEXT: v_readlane_b32 s18, v37, 63 +; SI-NEXT: buffer_store_dword v29, v16, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v16, vcc, 36, v0 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: buffer_store_dword v13, v16, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 40, v0 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: buffer_store_dword v28, v13, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v13, vcc, 44, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 48, v0 +; SI-NEXT: v_mov_b32_e32 v13, s16 +; SI-NEXT: s_and_b32 s16, s47, 0xff +; SI-NEXT: s_lshl_b32 s17, s51, 8 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: buffer_store_dword v13, v11, s[0:3], 0 offen +; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v21 -; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v25 +; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_or_b32_e32 v10, s22, v10 -; SI-NEXT: s_and_b32 s20, s20, 0xff -; SI-NEXT: s_lshl_b32 s22, s62, 8 -; SI-NEXT: s_or_b32 s20, s20, s22 -; SI-NEXT: v_readlane_b32 s22, v62, 42 -; SI-NEXT: v_readlane_b32 s23, v62, 43 -; SI-NEXT: s_and_b32 s22, s22, 0xff -; SI-NEXT: v_readlane_b32 s24, v62, 44 -; SI-NEXT: s_lshl_b32 s22, s22, 16 -; SI-NEXT: s_lshl_b32 s23, s24, 24 -; SI-NEXT: s_and_b32 s20, s20, 0xffff -; SI-NEXT: s_or_b32 s22, s23, s22 -; SI-NEXT: v_add_i32_e32 v11, vcc, 52, v24 -; SI-NEXT: s_or_b32 s20, s20, s22 +; SI-NEXT: v_or_b32_e32 v10, s16, v10 +; SI-NEXT: s_and_b32 s16, s44, 0xff +; SI-NEXT: s_lshl_b32 s17, s88, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s54, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s66, 24 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: v_add_i32_e32 v11, vcc, 52, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 56, v24 -; SI-NEXT: v_mov_b32_e32 v11, s20 +; SI-NEXT: v_add_i32_e32 v10, vcc, 56, v0 +; SI-NEXT: v_mov_b32_e32 v11, s16 +; SI-NEXT: s_and_b32 s16, s45, 0xff +; SI-NEXT: s_lshl_b32 s17, s39, 8 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_and_b32 s20, s21, 0xff -; SI-NEXT: s_lshl_b32 s21, s52, 8 -; SI-NEXT: s_or_b32 s20, s20, s21 -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v16 -; SI-NEXT: s_and_b32 s20, s20, 0xffff -; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: s_lshl_b32 s21, s96, 24 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v8 -; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x44, v24 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v19 -; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v40 -; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v60 -; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v41 -; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v42 -; SI-NEXT: v_readlane_b32 s45, v62, 17 -; SI-NEXT: v_readlane_b32 s43, v62, 23 -; SI-NEXT: v_readlane_b32 s41, v62, 29 -; SI-NEXT: v_readlane_b32 s29, v62, 35 -; SI-NEXT: v_readlane_b32 s27, v62, 41 -; SI-NEXT: v_readlane_b32 s25, v62, 45 -; SI-NEXT: v_readlane_b32 s99, v63, 35 -; SI-NEXT: v_readlane_b32 s97, v63, 33 -; SI-NEXT: v_readlane_b32 s96, v63, 32 -; SI-NEXT: v_readlane_b32 s87, v63, 31 -; SI-NEXT: v_readlane_b32 s85, v63, 29 -; SI-NEXT: v_readlane_b32 s83, v63, 27 -; SI-NEXT: v_readlane_b32 s81, v63, 25 -; SI-NEXT: v_readlane_b32 s71, v63, 23 -; SI-NEXT: v_readlane_b32 s70, v63, 22 -; SI-NEXT: v_readlane_b32 s69, v63, 21 -; SI-NEXT: v_readlane_b32 s68, v63, 20 -; SI-NEXT: v_readlane_b32 s66, v63, 18 -; SI-NEXT: v_readlane_b32 s64, v63, 16 -; SI-NEXT: v_readlane_b32 s54, v63, 14 -; SI-NEXT: v_readlane_b32 s52, v63, 12 -; SI-NEXT: v_readlane_b32 s39, v63, 7 -; SI-NEXT: v_readlane_b32 s37, v63, 5 -; SI-NEXT: v_readlane_b32 s35, v63, 3 -; SI-NEXT: v_readlane_b32 s31, v63, 1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v2 +; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v24 +; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_or_b32_e32 v9, s20, v9 -; SI-NEXT: s_lshl_b32 s20, s58, 8 -; SI-NEXT: s_or_b32 s18, s18, s20 -; SI-NEXT: s_and_b32 s20, s98, 0xff -; SI-NEXT: s_lshl_b32 s20, s20, 16 -; SI-NEXT: s_and_b32 s18, s18, 0xffff -; SI-NEXT: s_or_b32 s20, s21, s20 -; SI-NEXT: v_add_i32_e32 v10, vcc, 60, v24 -; SI-NEXT: s_or_b32 s18, s18, s20 -; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v10, s18 -; SI-NEXT: s_and_b32 s18, s19, 0xff -; SI-NEXT: s_lshl_b32 s19, s50, 8 -; SI-NEXT: s_or_b32 s18, s18, s19 -; SI-NEXT: s_and_b32 s18, s18, 0xffff -; SI-NEXT: v_or_b32_e32 v0, s18, v0 -; SI-NEXT: s_lshl_b32 s18, s38, 8 -; SI-NEXT: s_or_b32 s16, s16, s18 -; SI-NEXT: s_and_b32 s18, s36, 0xff -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_lshl_b32 s19, s34, 24 +; SI-NEXT: v_or_b32_e32 v9, s16, v9 +; SI-NEXT: s_and_b32 s16, s42, 0xff +; SI-NEXT: s_lshl_b32 s17, s98, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s96, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s84, 24 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: v_add_i32_e32 v10, vcc, 60, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v9, vcc, 64, v24 -; SI-NEXT: s_or_b32 s16, s16, s18 +; SI-NEXT: v_add_i32_e32 v9, vcc, 64, v0 +; SI-NEXT: v_mov_b32_e32 v10, s16 +; SI-NEXT: s_and_b32 s16, s43, 0xff +; SI-NEXT: s_lshl_b32 s17, s37, 8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 ; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v22 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v8, s16, v8 +; SI-NEXT: s_and_b32 s16, s40, 0xff +; SI-NEXT: s_lshl_b32 s17, s80, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s64, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s22, 24 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x44, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x48, v24 -; SI-NEXT: v_mov_b32_e32 v8, s16 -; SI-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen -; SI-NEXT: s_and_b32 s16, s17, 0xff -; SI-NEXT: s_lshl_b32 s17, s48, 8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v45 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x48, v0 +; SI-NEXT: v_mov_b32_e32 v9, s16 +; SI-NEXT: s_and_b32 s16, s41, 0xff +; SI-NEXT: s_lshl_b32 s17, s35, 8 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v21 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: v_or_b32_e32 v0, v7, v0 -; SI-NEXT: v_or_b32_e32 v0, s16, v0 -; SI-NEXT: s_lshl_b32 s16, s30, 8 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_readlane_b32 s19, v36, 0 +; SI-NEXT: v_or_b32_e32 v7, s16, v7 +; SI-NEXT: s_and_b32 s14, s14, 0xff +; SI-NEXT: s_lshl_b32 s16, s48, 8 ; SI-NEXT: s_or_b32 s14, s14, s16 -; SI-NEXT: s_and_b32 s16, s94, 0xff +; SI-NEXT: s_and_b32 s16, s30, 0xff +; SI-NEXT: v_readlane_b32 s18, v36, 23 ; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_lshl_b32 s17, s92, 24 +; SI-NEXT: s_lshl_b32 s17, s18, 24 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x4c, v24 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x4c, v0 ; SI-NEXT: s_or_b32 s14, s14, s16 -; SI-NEXT: buffer_store_dword v0, v7, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x50, v24 -; SI-NEXT: v_mov_b32_e32 v7, s14 -; SI-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0 +; SI-NEXT: v_mov_b32_e32 v8, s14 ; SI-NEXT: s_and_b32 s14, s15, 0xff -; SI-NEXT: s_lshl_b32 s15, s67, 8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v36 +; SI-NEXT: s_lshl_b32 s15, s29, 8 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v19 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: v_or_b32_e32 v0, v6, v0 -; SI-NEXT: v_or_b32_e32 v0, s14, v0 -; SI-NEXT: s_lshl_b32 s14, s90, 8 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_or_b32_e32 v6, s14, v6 +; SI-NEXT: v_readlane_b32 s14, v36, 25 +; SI-NEXT: s_and_b32 s12, s12, 0xff +; SI-NEXT: v_readlane_b32 s15, v36, 26 +; SI-NEXT: s_lshl_b32 s14, s14, 8 ; SI-NEXT: s_or_b32 s12, s12, s14 -; SI-NEXT: s_and_b32 s14, s88, 0xff +; SI-NEXT: v_readlane_b32 s14, v36, 27 +; SI-NEXT: v_readlane_b32 s15, v36, 28 +; SI-NEXT: s_and_b32 s14, s14, 0xff +; SI-NEXT: v_readlane_b32 s16, v36, 29 ; SI-NEXT: s_lshl_b32 s14, s14, 16 -; SI-NEXT: s_lshl_b32 s15, s78, 24 +; SI-NEXT: s_lshl_b32 s15, s16, 24 ; SI-NEXT: s_and_b32 s12, s12, 0xffff ; SI-NEXT: s_or_b32 s14, s15, s14 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x54, v24 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 ; SI-NEXT: s_or_b32 s12, s12, s14 -; SI-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x58, v24 -; SI-NEXT: v_mov_b32_e32 v6, s12 -; SI-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x58, v0 +; SI-NEXT: v_mov_b32_e32 v7, s12 ; SI-NEXT: s_and_b32 s12, s13, 0xff -; SI-NEXT: s_lshl_b32 s13, s65, 8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v5 +; SI-NEXT: s_lshl_b32 s13, s28, 8 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s12, s12, s13 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v61 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v18 ; SI-NEXT: s_and_b32 s12, s12, 0xffff -; SI-NEXT: v_or_b32_e32 v0, v5, v0 -; SI-NEXT: v_or_b32_e32 v0, s12, v0 -; SI-NEXT: s_lshl_b32 s12, s76, 8 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_or_b32_e32 v5, s12, v5 +; SI-NEXT: v_readlane_b32 s12, v36, 31 +; SI-NEXT: s_and_b32 s10, s10, 0xff +; SI-NEXT: v_readlane_b32 s13, v36, 32 +; SI-NEXT: s_lshl_b32 s12, s12, 8 ; SI-NEXT: s_or_b32 s10, s10, s12 -; SI-NEXT: s_and_b32 s12, s74, 0xff +; SI-NEXT: v_readlane_b32 s12, v36, 33 +; SI-NEXT: v_readlane_b32 s13, v36, 34 +; SI-NEXT: s_and_b32 s12, s12, 0xff ; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: s_lshl_b32 s13, s72, 24 +; SI-NEXT: s_lshl_b32 s13, s70, 24 ; SI-NEXT: s_and_b32 s10, s10, 0xffff ; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x5c, v24 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x5c, v0 ; SI-NEXT: s_or_b32 s10, s10, s12 -; SI-NEXT: buffer_store_dword v0, v5, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x60, v24 -; SI-NEXT: v_mov_b32_e32 v5, s10 -; SI-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0 +; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_and_b32 s10, s11, 0xff -; SI-NEXT: s_lshl_b32 s11, s55, 8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v43 +; SI-NEXT: s_lshl_b32 s11, s27, 8 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v17 ; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: v_or_b32_e32 v0, s10, v0 -; SI-NEXT: s_lshl_b32 s10, s60, 8 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v4, s10, v4 +; SI-NEXT: v_readlane_b32 s10, v36, 37 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: v_readlane_b32 s11, v36, 38 +; SI-NEXT: s_lshl_b32 s10, s10, 8 ; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: s_and_b32 s10, s56, 0xff +; SI-NEXT: v_readlane_b32 s10, v36, 39 +; SI-NEXT: v_readlane_b32 s11, v36, 40 +; SI-NEXT: s_and_b32 s10, s10, 0xff +; SI-NEXT: v_readlane_b32 s12, v36, 41 ; SI-NEXT: s_lshl_b32 s10, s10, 16 -; SI-NEXT: s_lshl_b32 s11, s46, 24 +; SI-NEXT: s_lshl_b32 s11, s12, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xffff ; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x64, v24 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x64, v0 ; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x68, v24 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 +; SI-NEXT: v_mov_b32_e32 v5, s8 ; SI-NEXT: s_and_b32 s8, s9, 0xff -; SI-NEXT: s_lshl_b32 s9, s53, 8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 +; SI-NEXT: s_lshl_b32 s9, s26, 8 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v15 ; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: v_or_b32_e32 v0, v3, v0 -; SI-NEXT: v_or_b32_e32 v0, s8, v0 -; SI-NEXT: s_lshl_b32 s8, s86, 8 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v3, s8, v3 +; SI-NEXT: v_readlane_b32 s8, v36, 43 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: v_readlane_b32 s9, v36, 44 +; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: s_and_b32 s8, s84, 0xff +; SI-NEXT: v_readlane_b32 s8, v37, 8 +; SI-NEXT: v_readlane_b32 s9, v37, 9 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: v_readlane_b32 s10, v37, 6 ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_lshl_b32 s9, s82, 24 +; SI-NEXT: s_lshl_b32 s9, s10, 24 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v24 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x6c, v0 ; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x70, v24 -; SI-NEXT: v_mov_b32_e32 v3, s6 -; SI-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_mov_b32_e32 v4, s6 ; SI-NEXT: s_and_b32 s6, s7, 0xff -; SI-NEXT: s_lshl_b32 s7, s51, 8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v17 +; SI-NEXT: s_lshl_b32 s7, s93, 8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v14 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: v_or_b32_e32 v0, s6, v0 -; SI-NEXT: v_readlane_b32 s6, v62, 46 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v2, s6, v2 +; SI-NEXT: v_readlane_b32 s6, v37, 2 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: v_readlane_b32 s7, v37, 3 ; SI-NEXT: s_lshl_b32 s6, s6, 8 -; SI-NEXT: v_readlane_b32 s7, v62, 47 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: s_and_b32 s6, s80, 0xff -; SI-NEXT: v_readlane_b32 s8, v62, 48 +; SI-NEXT: v_readlane_b32 s6, v37, 4 +; SI-NEXT: v_readlane_b32 s7, v37, 5 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: v_readlane_b32 s8, v37, 0 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_lshl_b32 s7, s8, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v24 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x78, v24 -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: v_mov_b32_e32 v3, s4 ; SI-NEXT: s_and_b32 s4, s5, 0xff -; SI-NEXT: s_lshl_b32 s5, s49, 8 +; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v12 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_readlane_b32 s9, v62, 49 -; SI-NEXT: v_readlane_b32 s98, v63, 34 -; SI-NEXT: v_readlane_b32 s86, v63, 30 -; SI-NEXT: v_readlane_b32 s84, v63, 28 -; SI-NEXT: v_readlane_b32 s82, v63, 26 -; SI-NEXT: v_readlane_b32 s80, v63, 24 -; SI-NEXT: v_readlane_b32 s67, v63, 19 -; SI-NEXT: v_readlane_b32 s65, v63, 17 -; SI-NEXT: v_readlane_b32 s55, v63, 15 -; SI-NEXT: v_readlane_b32 s53, v63, 13 -; SI-NEXT: v_readlane_b32 s51, v63, 11 -; SI-NEXT: v_readlane_b32 s50, v63, 10 -; SI-NEXT: v_readlane_b32 s49, v63, 9 -; SI-NEXT: v_readlane_b32 s48, v63, 8 -; SI-NEXT: v_readlane_b32 s38, v63, 6 -; SI-NEXT: v_readlane_b32 s36, v63, 4 -; SI-NEXT: v_readlane_b32 s34, v63, 2 -; SI-NEXT: v_readlane_b32 s30, v63, 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v0, s4, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x7c, v24 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_readlane_b32 s19, v36, 24 +; SI-NEXT: v_readlane_b32 s17, v36, 30 +; SI-NEXT: v_readlane_b32 s13, v36, 42 +; SI-NEXT: v_readlane_b32 s11, v37, 7 +; SI-NEXT: v_readlane_b32 s9, v37, 1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_readlane_b32 s99, v34, 35 +; SI-NEXT: v_readlane_b32 s98, v34, 34 +; SI-NEXT: v_readlane_b32 s97, v34, 33 +; SI-NEXT: v_readlane_b32 s96, v34, 32 +; SI-NEXT: v_readlane_b32 s87, v34, 31 +; SI-NEXT: v_readlane_b32 s86, v34, 30 +; SI-NEXT: v_readlane_b32 s85, v34, 29 +; SI-NEXT: v_readlane_b32 s84, v34, 28 +; SI-NEXT: v_readlane_b32 s83, v34, 27 +; SI-NEXT: v_readlane_b32 s82, v34, 26 +; SI-NEXT: v_readlane_b32 s81, v34, 25 +; SI-NEXT: v_readlane_b32 s80, v34, 24 +; SI-NEXT: v_readlane_b32 s71, v34, 23 +; SI-NEXT: v_readlane_b32 s70, v34, 22 +; SI-NEXT: v_readlane_b32 s69, v34, 21 +; SI-NEXT: v_readlane_b32 s68, v34, 20 +; SI-NEXT: v_readlane_b32 s67, v34, 19 +; SI-NEXT: v_readlane_b32 s66, v34, 18 +; SI-NEXT: v_readlane_b32 s65, v34, 17 +; SI-NEXT: v_readlane_b32 s64, v34, 16 +; SI-NEXT: v_readlane_b32 s55, v34, 15 +; SI-NEXT: v_readlane_b32 s54, v34, 14 +; SI-NEXT: v_readlane_b32 s53, v34, 13 +; SI-NEXT: v_readlane_b32 s52, v34, 12 +; SI-NEXT: v_readlane_b32 s51, v34, 11 +; SI-NEXT: v_readlane_b32 s50, v34, 10 +; SI-NEXT: v_readlane_b32 s49, v34, 9 +; SI-NEXT: v_readlane_b32 s48, v34, 8 +; SI-NEXT: v_readlane_b32 s39, v34, 7 +; SI-NEXT: v_readlane_b32 s38, v34, 6 +; SI-NEXT: v_readlane_b32 s37, v34, 5 +; SI-NEXT: v_readlane_b32 s36, v34, 4 +; SI-NEXT: v_readlane_b32 s35, v34, 3 +; SI-NEXT: v_readlane_b32 s34, v34, 2 +; SI-NEXT: v_readlane_b32 s31, v34, 1 +; SI-NEXT: v_readlane_b32 s30, v34, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -213693,8 +209495,34 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v64bf16_to_v64f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v29 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v3 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v27 +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -213711,1233 +209539,1120 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v23 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v23 -; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v22 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v22 -; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v21 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v21 -; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v20 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v20 -; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v19 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v19 -; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v18 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v18 -; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v17 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v17 -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v16 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v16 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v10 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v8 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v30 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v30 -; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v29 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v29 -; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v28 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v28 -; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v27 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v27 -; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v26 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v26 -; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v25 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v25 -; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v24 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v63, 0xffff0000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v15 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v14 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v14 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v13 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v13 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v12 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v31 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v31 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v63 -; SI-NEXT: v_mul_f32_e32 v63, 1.0, v8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v32 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v62 -; SI-NEXT: v_mul_f32_e32 v62, 1.0, v9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v34 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v35 -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v36 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v37 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v38 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v39 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v48 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v49 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v50 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v51 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v52 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v54 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v55 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v40 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v41 -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v42 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v43 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v44 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v45 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v46 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v47 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v56 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v57 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v58 -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v59 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v60 -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v61 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB100_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12 +; SI-NEXT: v_and_b32_e32 v56, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 +; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v13 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 +; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v45 +; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v17 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v42 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; kill: killed $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; kill: killed $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; kill: killed $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; kill: killed $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; kill: killed $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; kill: killed $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v52 +; SI-NEXT: ; kill: killed $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53 +; SI-NEXT: ; kill: killed $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 +; SI-NEXT: ; kill: killed $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v50 +; SI-NEXT: ; kill: killed $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 +; SI-NEXT: ; kill: killed $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 +; SI-NEXT: ; kill: killed $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 +; SI-NEXT: ; kill: killed $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 +; SI-NEXT: ; kill: killed $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v37 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v62, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v57 +; SI-NEXT: ; kill: killed $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v31 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v62 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v59 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v56 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v46 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v40 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v54 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v32 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v63 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v61 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v60 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v58 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v47 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v41 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; kill: killed $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; kill: killed $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; kill: killed $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; kill: killed $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB100_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_alignbit_b32 v2, v2, v45, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_alignbit_b32 v2, v2, v7, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_alignbit_b32 v2, v2, v43, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v22 +; SI-NEXT: v_alignbit_b32 v1, v1, v40, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 +; SI-NEXT: v_alignbit_b32 v1, v1, v54, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v18 +; SI-NEXT: v_alignbit_b32 v1, v1, v11, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v16 +; SI-NEXT: v_alignbit_b32 v1, v1, v50, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14 +; SI-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_alignbit_b32 v1, v1, v48, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_alignbit_b32 v1, v1, v19, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_alignbit_b32 v1, v1, v9, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v13 +; SI-NEXT: v_alignbit_b32 v1, v1, v17, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62 +; SI-NEXT: v_alignbit_b32 v1, v1, v5, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_alignbit_b32 v1, v1, v15, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v57 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v2, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v30 +; SI-NEXT: v_alignbit_b32 v0, v6, v0, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v4, v63, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v28 +; SI-NEXT: v_alignbit_b32 v0, v4, v0, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v61, v60, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v26 +; SI-NEXT: v_alignbit_b32 v0, v61, v0, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v58, v56, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v24 +; SI-NEXT: v_alignbit_b32 v0, v58, v0, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v47, v46, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v22 +; SI-NEXT: v_alignbit_b32 v0, v47, v0, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v44, v42, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v20 +; SI-NEXT: v_alignbit_b32 v0, v44, v0, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v41, v55, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v18 +; SI-NEXT: v_alignbit_b32 v0, v41, v0, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v53, v52, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v16 +; SI-NEXT: v_alignbit_b32 v0, v53, v0, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v51, v49, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v14 +; SI-NEXT: v_alignbit_b32 v0, v51, v0, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v39, v38, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v12 +; SI-NEXT: v_alignbit_b32 v0, v39, v0, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v37, v21, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 +; SI-NEXT: v_alignbit_b32 v0, v37, v0, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v36, v23, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v0, v36, v0, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v35, v25, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 +; SI-NEXT: v_alignbit_b32 v0, v35, v0, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v34, v27, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v62 +; SI-NEXT: v_alignbit_b32 v0, v34, v0, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v29, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59 +; SI-NEXT: v_alignbit_b32 v0, v33, v0, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v32, v31, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 +; SI-NEXT: v_alignbit_b32 v0, v32, v0, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: .LBB100_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB100_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v30 +; SI-NEXT: v_alignbit_b32 v2, v4, v2, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; SI-NEXT: v_alignbit_b32 v4, v6, v4, 16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v63 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v62 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v61 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v60 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v59 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v58 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 +; SI-NEXT: v_alignbit_b32 v7, v21, v7, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v7 ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_alignbit_b32 v20, v20, v21, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v56 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v47 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v46 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v45 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v43 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v41 -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v40 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v55 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v54 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v53 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v52 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v18 +; SI-NEXT: v_alignbit_b32 v11, v20, v11, 16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v51 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v50 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; SI-NEXT: v_alignbit_b32 v11, v20, v11, 16 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v25 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v49 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v48 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v11 +; SI-NEXT: v_alignbit_b32 v3, v14, v3, 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v38 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_alignbit_b32 v3, v14, v3, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v37 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v36 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v10 +; SI-NEXT: v_alignbit_b32 v3, v14, v3, 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v35 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v34 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_alignbit_b32 v3, v9, v3, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v13 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v33 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v9 +; SI-NEXT: v_alignbit_b32 v3, v13, v3, 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v62 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; SI-NEXT: v_alignbit_b32 v3, v13, v3, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v59 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 +; SI-NEXT: v_alignbit_b32 v3, v14, v3, 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v14, v0, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_alignbit_b32 v32, v31, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v0, v32, v0, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_alignbit_b32 v33, v29, v3, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_alignbit_b32 v34, v27, v13, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_alignbit_b32 v35, v25, v13, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_alignbit_b32 v36, v23, v13, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_alignbit_b32 v37, v21, v13, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_alignbit_b32 v39, v38, v13, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_alignbit_b32 v51, v49, v13, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_alignbit_b32 v53, v52, v13, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_alignbit_b32 v44, v42, v15, 16 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_alignbit_b32 v47, v46, v15, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_alignbit_b32 v58, v56, v6, 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v1, v58, v1, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v47, v15, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v44, v7, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_alignbit_b32 v61, v60, v6, 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_alignbit_b32 v41, v55, v14, 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v18 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v41, v14, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v53, v13, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v57 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v51, v11, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v39, v12, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v37, v10, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v60 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v36, v8, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v35, v9, 16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v62 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v34, v5, 16 +; SI-NEXT: v_alignbit_b32 v16, v61, v16, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v33, v3, 16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_alignbit_b32 v4, v63, v4, 16 +; SI-NEXT: v_alignbit_b32 v17, v4, v17, 16 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_alignbit_b32 v6, v2, v6, 16 +; SI-NEXT: v_alignbit_b32 v18, v6, v18, 16 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: .LBB100_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v4, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v60 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v63 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v58 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v47 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; SI-NEXT: v_or_b32_e32 v31, v33, v31 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_or_b32_e32 v30, v30, v33 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64bf16_to_v64f16: @@ -217242,43 +212957,52 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; SI-LABEL: bitcast_v64bf16_to_v64f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v16 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v16 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v14 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v14 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v12 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v11 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v10 -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v7 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_and_b32 s6, s29, 0xffff0000 ; SI-NEXT: s_lshl_b32 s7, s29, 16 ; SI-NEXT: s_and_b32 s8, s28, 0xffff0000 @@ -217307,6 +213031,20 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_and_b32 s43, s16, 0xffff0000 ; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v4 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -217324,975 +213062,1468 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v35 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v34 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v33 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v32 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v31 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v30 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v23 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v31 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v25 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v21 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v19 +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s43 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s42 +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s41 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s40 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v47, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s14 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s12 +; SI-NEXT: v_mul_f32_e64 v53, 1.0, s10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v42, 1.0, s6 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v16 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s15 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s11 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s7 +; SI-NEXT: v_mul_f32_e64 v56, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v44, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s13 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s9 +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: s_cbranch_scc0 .LBB101_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v36 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v12, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v63 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v52 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v53 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v20 +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v39, v23 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v46, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v40 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v47 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v58, v60 +; SI-NEXT: v_mov_b32_e32 v59, v61 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[1:2], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[1:2], v[4:5], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v22 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v47 +; SI-NEXT: v_mov_b32_e32 v8, v32 +; SI-NEXT: v_lshr_b64 v[4:5], v[16:17], 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 +; SI-NEXT: v_mov_b32_e32 v16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v9, v12 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v49 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[4:5], v[19:20], 16 +; SI-NEXT: v_mov_b32_e32 v19, v21 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v3, v54 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v4, v49 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v52 +; SI-NEXT: v_lshr_b64 v[1:2], v[10:11], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[1:2], v[13:14], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v48 +; SI-NEXT: v_mov_b32_e32 v11, v29 +; SI-NEXT: v_mov_b32_e32 v13, v24 +; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v11 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v39 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v4 +; SI-NEXT: v_mov_b32_e32 v14, v57 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v63 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[1:2], v[22:23], 16 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshr_b64 v[21:22], v[35:36], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshr_b64 v[1:2], v[34:35], 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v45 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v0 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[6:7], v[33:34], 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v38 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v38 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[42:43], v[31:32], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[25:26], 16 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; SI-NEXT: v_lshr_b64 v[37:38], v[41:42], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[28:29], 16 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_mov_b32_e32 v24, v13 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v21 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[54:55], v[27:28], 16 +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v55, v45 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v50 +; SI-NEXT: v_mov_b32_e32 v61, v45 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_mov_b32_e32 v60, v44 +; SI-NEXT: v_mov_b32_e32 v44, v12 +; SI-NEXT: v_mov_b32_e32 v43, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; SI-NEXT: v_lshr_b64 v[30:31], v[30:31], 16 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v48 +; SI-NEXT: v_lshr_b64 v[47:48], v[35:36], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, v51 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v52 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 +; SI-NEXT: v_lshr_b64 v[18:19], v[26:27], 16 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[18:19], v[31:32], 16 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v26, v21 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[2:3], v[43:44], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v44, v23 +; SI-NEXT: v_lshr_b64 v[22:23], v[22:23], 16 +; SI-NEXT: v_mov_b32_e32 v23, v32 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v34, v3 +; SI-NEXT: v_mov_b32_e32 v33, v2 +; SI-NEXT: v_lshr_b64 v[2:3], v[33:34], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v19, v3 +; SI-NEXT: v_mov_b32_e32 v18, v2 +; SI-NEXT: v_lshr_b64 v[2:3], v[18:19], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v18, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[2:3], v[20:21], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v21, v3 +; SI-NEXT: v_mov_b32_e32 v20, v2 +; SI-NEXT: v_lshr_b64 v[2:3], v[20:21], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v20, v42 +; SI-NEXT: v_mov_b32_e32 v42, v30 +; SI-NEXT: v_lshr_b64 v[30:31], v[29:30], 16 +; SI-NEXT: v_lshr_b64 v[31:32], v[5:6], 16 +; SI-NEXT: v_mov_b32_e32 v32, v8 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v2, v36 +; SI-NEXT: v_mov_b32_e32 v5, v56 +; SI-NEXT: v_lshr_b64 v[28:29], v[56:57], 16 +; SI-NEXT: v_mov_b32_e32 v56, v60 +; SI-NEXT: v_lshr_b64 v[35:36], v[60:61], 16 +; SI-NEXT: v_mov_b32_e32 v57, v14 +; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[2:3], v[53:54], 16 +; SI-NEXT: v_mov_b32_e32 v3, v6 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: v_mov_b32_e32 v4, v10 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshr_b64 v[8:9], v[7:8], 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_lshr_b64 v[53:54], v[3:4], 16 +; SI-NEXT: v_mov_b32_e32 v3, v58 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[3:4], v[58:59], 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshr_b64 v[60:61], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[14:15], 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[9:10], v[49:50], 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[9:10], v[0:1], 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[9:10], v[51:52], 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[9:10], v[16:17], 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[9:10], v[24:25], 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[9:10], v[12:13], 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[9:10], v[39:40], 16 +; SI-NEXT: v_mov_b32_e32 v40, v2 +; SI-NEXT: v_lshr_b64 v[2:3], v[62:63], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[9:10], v[45:46], 16 +; SI-NEXT: v_mov_b32_e32 v13, v39 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v46, v19 +; SI-NEXT: v_mov_b32_e32 v39, v34 +; SI-NEXT: v_mov_b32_e32 v1, v27 +; SI-NEXT: v_mov_b32_e32 v27, v22 +; SI-NEXT: s_branch .LBB101_3 +; SI-NEXT: .LBB101_2: +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v20 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v18 +; SI-NEXT: v_mov_b32_e32 v56, v44 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_mov_b32_e32 v55, v45 +; SI-NEXT: v_mov_b32_e32 v45, v43 +; SI-NEXT: v_mov_b32_e32 v13, v9 +; SI-NEXT: v_mov_b32_e32 v16, v48 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v12, v61 +; SI-NEXT: .LBB101_3: ; %Flow +; SI-NEXT: v_mov_b32_e32 v14, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB101_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v49 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v62 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[33:34], v[18:19], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v35, 0x40c00000, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v17 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v42, 0x40c00000, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v17 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v52, 0x40c00000, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v17 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[51:52], v[52:53], 16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v56, 0x40c00000, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v58, 0x40c00000, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v60, 0x40c00000, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v17 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v62, 0x40c00000, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v17 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v46, 0x40c00000, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v17 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v40, 0x40c00000, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v54, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mul_f32_e32 v62, 1.0, v17 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v63, 1.0, v19 -; SI-NEXT: v_mul_f32_e64 v18, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v20, 1.0, s43 -; SI-NEXT: v_mul_f32_e64 v21, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v22, 1.0, s42 -; SI-NEXT: v_mul_f32_e64 v23, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v24, 1.0, s41 -; SI-NEXT: v_mul_f32_e64 v25, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v26, 1.0, s40 -; SI-NEXT: v_mul_f32_e64 v27, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v28, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v29, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v30, 1.0, s28 -; SI-NEXT: v_mul_f32_e64 v31, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v32, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v33, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v39, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v49, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s15 -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s14 -; SI-NEXT: v_mul_f32_e64 v15, 1.0, s13 -; SI-NEXT: v_mul_f32_e64 v19, 1.0, s12 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s11 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s10 -; SI-NEXT: v_mul_f32_e64 v13, 1.0, s9 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s8 -; SI-NEXT: v_mul_f32_e64 v17, 1.0, s7 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s6 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB101_4 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v20 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v53 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v55 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v41 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v55, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v2 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v43 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v55 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v4 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v45 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v5 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_lshr_b64 v[37:38], v[0:1], 16 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v7 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v47 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v24 +; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v9 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v10 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v61 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v16 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v11 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v39 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v17 +; SI-NEXT: v_lshr_b64 v[16:17], v[22:23], 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v12 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v11 +; SI-NEXT: v_lshr_b64 v[10:11], v[35:36], 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v13 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v14 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v13 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v15 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v14 +; SI-NEXT: v_lshr_b64 v[13:14], v[58:59], 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, v31 +; SI-NEXT: v_lshr_b64 v[30:31], v[30:31], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, v21 +; SI-NEXT: v_lshr_b64 v[20:21], v[20:21], 16 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v6 +; SI-NEXT: v_lshr_b64 v[5:6], v[60:61], 16 +; SI-NEXT: v_mov_b32_e32 v60, v37 +; SI-NEXT: v_lshr_b64 v[37:38], v[46:47], 16 +; SI-NEXT: v_mov_b32_e32 v39, v5 +; SI-NEXT: v_mov_b32_e32 v38, v4 +; SI-NEXT: v_lshr_b64 v[4:5], v[25:26], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v55 +; SI-NEXT: v_mov_b32_e32 v46, v13 +; SI-NEXT: v_mov_b32_e32 v45, v12 +; SI-NEXT: v_mov_b32_e32 v11, v19 +; SI-NEXT: v_mov_b32_e32 v12, v20 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, v36 +; SI-NEXT: v_lshr_b64 v[35:36], v[42:43], 16 +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v2 +; SI-NEXT: v_mov_b32_e32 v42, v30 +; SI-NEXT: v_lshr_b64 v[30:31], v[29:30], 16 +; SI-NEXT: v_mov_b32_e32 v44, v35 +; SI-NEXT: v_lshr_b64 v[31:32], v[32:33], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, v50 +; SI-NEXT: v_lshr_b64 v[49:50], v[49:50], 16 +; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v8 +; SI-NEXT: v_lshr_b64 v[7:8], v[62:63], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, v53 +; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_mov_b32_e32 v13, v6 +; SI-NEXT: v_lshr_b64 v[6:7], v[27:28], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v57 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, v23 +; SI-NEXT: v_lshr_b64 v[22:23], v[56:57], 16 +; SI-NEXT: v_mov_b32_e32 v23, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v59 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v35 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, v59 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v37 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, v26 +; SI-NEXT: v_mov_b32_e32 v26, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v24 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[1:2], v[40:41], 16 +; SI-NEXT: v_mov_b32_e32 v2, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[2:3], v[3:4], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v2, v37 +; SI-NEXT: v_lshr_b64 v[27:28], v[34:35], 16 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[40:41], v[11:12], 16 +; SI-NEXT: v_lshr_b64 v[7:8], v[0:1], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_cbranch_execnz .LBB101_3 -; SI-NEXT: .LBB101_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v63 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshr_b64 v[2:3], v[36:37], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v2, v14 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v62 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v61 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v60 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v58 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v56 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v47 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v46 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v45 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v44 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v43 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v42 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v41 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v40 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v55 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v54 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v53 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v52 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v51 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[7:8], v[5:6], 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[36:37], v[48:49], 16 +; SI-NEXT: v_lshr_b64 v[47:48], v[9:10], 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[2:3], v[13:14], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[2:3], v[38:39], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[2:3], v[45:46], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v2, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[2:3], v[21:22], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[2:3], v[15:16], 16 +; SI-NEXT: v_mov_b32_e32 v20, v50 +; SI-NEXT: v_mov_b32_e32 v21, v51 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[2:3], v[20:21], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v20, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v35, v2 +; SI-NEXT: v_mov_b32_e32 v2, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v2, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: .LBB101_5: ; %end +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v39 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v46 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v36 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v40 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v53 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v52 -; SI-NEXT: v_add_f32_e32 v52, 0x40c00000, v52 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v52 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 -; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v51 -; SI-NEXT: v_add_f32_e32 v51, 0x40c00000, v51 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v54 -; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v55 -; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 -; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v41 -; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v42 -; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v43 -; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v44 -; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v45 -; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v46 -; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v47 -; SI-NEXT: v_and_b32_e32 v56, 0xffff0000, v56 -; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v57 -; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v10 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v59 -; SI-NEXT: v_and_b32_e32 v60, 0xffff0000, v60 -; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v61 -; SI-NEXT: v_and_b32_e32 v62, 0xffff0000, v62 -; SI-NEXT: v_and_b32_e32 v63, 0xffff0000, v63 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 -; SI-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 -; SI-NEXT: v_add_f32_e32 v40, 0x40c00000, v40 -; SI-NEXT: v_add_f32_e32 v41, 0x40c00000, v41 -; SI-NEXT: v_add_f32_e32 v42, 0x40c00000, v42 -; SI-NEXT: v_add_f32_e32 v43, 0x40c00000, v43 -; SI-NEXT: v_add_f32_e32 v44, 0x40c00000, v44 -; SI-NEXT: v_add_f32_e32 v45, 0x40c00000, v45 -; SI-NEXT: v_add_f32_e32 v46, 0x40c00000, v46 -; SI-NEXT: v_add_f32_e32 v47, 0x40c00000, v47 -; SI-NEXT: v_add_f32_e32 v56, 0x40c00000, v56 -; SI-NEXT: v_add_f32_e32 v57, 0x40c00000, v57 -; SI-NEXT: v_add_f32_e32 v58, 0x40c00000, v58 -; SI-NEXT: v_add_f32_e32 v59, 0x40c00000, v59 -; SI-NEXT: v_add_f32_e32 v60, 0x40c00000, v60 -; SI-NEXT: v_add_f32_e32 v61, 0x40c00000, v61 -; SI-NEXT: v_add_f32_e32 v62, 0x40c00000, v62 -; SI-NEXT: v_add_f32_e32 v63, 0x40c00000, v63 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v40 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v42 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v43 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v45 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v46 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v47 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v56 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v57 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v58 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v59 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v60 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v61 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v62 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v18, v6 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: .LBB101_3: ; %end -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v52, v18 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v25 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v25 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v6, v6, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v39 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v39 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v60 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v25 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v25 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v53, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v25 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v25 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v18, v18, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v20, v20, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v25 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v27 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v47 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v29 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v51, v17 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v35 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v27, v27, v29 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v52 -; SI-NEXT: v_or_b32_e32 v18, v51, v18 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v52, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v36 -; SI-NEXT: v_or_b32_e32 v19, v53, v19 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v22, v22, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v26 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v50 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v34 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v31 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v35 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v38 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v39 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v21, v52, v21 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v30 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v49 -; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v37 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; SI-NEXT: v_or_b32_e32 v31, v33, v31 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v35 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: v_or_b32_e32 v20, v51, v20 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v33 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB101_4: -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: s_branch .LBB101_2 ; ; VI-LABEL: bitcast_v64bf16_to_v64f16_scalar: ; VI: ; %bb.0: @@ -221088,583 +217319,414 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v28 -; SI-NEXT: v_mov_b32_e32 v53, v3 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v9 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v32, v34 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v0 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v63 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v44 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v32, v46 -; SI-NEXT: v_mov_b32_e32 v55, v2 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 -; SI-NEXT: v_mov_b32_e32 v51, v1 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v56 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v58 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v1 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v62 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v55 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v47 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v54 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v11, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v50, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v43 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v57 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v59 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v46, v27 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v59, v2 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v61, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v57, v14 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v60 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB102_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v61 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v13 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v48 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v57 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v60 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v62 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v63 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr36 @@ -221689,626 +217751,641 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: .LBB102_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB102_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v60 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: v_add_f32_e32 v61, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v58 -; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v60, 0x38000000, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v57 -; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v46 -; SI-NEXT: v_add_f32_e32 v63, 0x38000000, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v56 -; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v45 -; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 -; SI-NEXT: v_add_f32_e32 v59, 0x38000000, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v44 -; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 -; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 -; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 -; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v53 -; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 -; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 -; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v49 -; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 -; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v37 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 -; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v36 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_add_f32_e32 v59, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v33 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v29 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 +; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v46 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v45 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v43 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v55 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v53 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v51 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v50 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v58 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_add_f32_e32 v61, 0x38000000, v61 +; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v62 +; SI-NEXT: v_add_f32_e32 v63, 0x38000000, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v28 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v60, 0x38000000, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v32 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v48 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v40 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v13 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v56 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v12 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v13 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v60 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_mov_b32_e32 v27, v9 -; SI-NEXT: v_mov_b32_e32 v26, v11 -; SI-NEXT: v_mov_b32_e32 v25, v12 -; SI-NEXT: v_mov_b32_e32 v28, v10 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: .LBB102_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v22 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v62 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v21 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v19 -; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v17 -; SI-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v24 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v15 -; SI-NEXT: v_alignbit_b32 v4, v4, v5, 16 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v25 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v14 -; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v28 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v26 -; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v27 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; SI-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_alignbit_b32 v9, v9, v10, 16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_alignbit_b32 v12, v12, v13, 16 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_alignbit_b32 v14, v14, v15, 16 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_alignbit_b32 v16, v16, v17, 16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_alignbit_b32 v18, v18, v19, 16 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_alignbit_b32 v19, v19, v20, 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_alignbit_b32 v20, v20, v21, 16 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_alignbit_b32 v21, v21, v22, 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_alignbit_b32 v22, v22, v23, 16 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_alignbit_b32 v23, v23, v24, 16 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_alignbit_b32 v24, v24, v25, 16 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_alignbit_b32 v25, v25, v26, 16 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_alignbit_b32 v26, v26, v27, 16 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_alignbit_b32 v27, v27, v28, 16 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_alignbit_b32 v28, v28, v29, 16 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v63 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_alignbit_b32 v29, v29, v30, 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v33 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 ; SI-NEXT: v_alignbit_b32 v31, v31, v32, 16 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -222551,7 +218628,21 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-LABEL: bitcast_v64f16_to_v64bf16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_lshr_b32 s43, s29, 16 +; SI-NEXT: s_lshr_b32 s42, s28, 16 +; SI-NEXT: s_lshr_b32 s41, s27, 16 +; SI-NEXT: s_lshr_b32 s40, s26, 16 +; SI-NEXT: s_lshr_b32 s15, s25, 16 +; SI-NEXT: s_lshr_b32 s14, s24, 16 +; SI-NEXT: s_lshr_b32 s13, s23, 16 +; SI-NEXT: s_lshr_b32 s12, s22, 16 +; SI-NEXT: s_lshr_b32 s11, s21, 16 +; SI-NEXT: s_lshr_b32 s10, s20, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s18, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -222568,1194 +218659,726 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v43, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v7 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s4 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v7, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v16 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v63, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v62, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v62 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v53, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v19 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v12, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v39, v7 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v51, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v41, v19 -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v17, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v30, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_cbranch_scc0 .LBB103_2 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; SI-NEXT: s_cbranch_scc0 .LBB103_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v60 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v0 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v6 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v38 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v60 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v57 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v31 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v8 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v62 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v51 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v43 -; SI-NEXT: v_mov_b32_e32 v42, v60 -; SI-NEXT: s_branch .LBB103_3 -; SI-NEXT: .LBB103_2: -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; kill: killed $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: .LBB103_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v53, v15 -; SI-NEXT: v_mov_b32_e32 v56, v13 -; SI-NEXT: v_mov_b32_e32 v8, v39 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v27, v38 -; SI-NEXT: v_mov_b32_e32 v13, v0 -; SI-NEXT: v_mov_b32_e32 v38, v24 -; SI-NEXT: v_mov_b32_e32 v39, v12 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v57, v14 -; SI-NEXT: v_mov_b32_e32 v47, v11 -; SI-NEXT: v_mov_b32_e32 v23, v40 -; SI-NEXT: v_mov_b32_e32 v40, v5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v25, v16 -; SI-NEXT: v_mov_b32_e32 v16, v9 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v46, v10 -; SI-NEXT: v_mov_b32_e32 v10, v2 -; SI-NEXT: v_mov_b32_e32 v6, v31 -; SI-NEXT: v_mov_b32_e32 v22, v29 -; SI-NEXT: v_mov_b32_e32 v26, v33 -; SI-NEXT: v_mov_b32_e32 v2, v49 -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: s_cbranch_vccnz .LBB103_5 -; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v25 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_mov_b32_e32 v19, v34 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v11 +; SI-NEXT: s_lshl_b32 s44, s16, 16 +; SI-NEXT: s_lshl_b32 s45, s6, 16 +; SI-NEXT: s_lshl_b32 s46, s17, 16 +; SI-NEXT: s_lshl_b32 s47, s7, 16 +; SI-NEXT: s_lshl_b32 s56, s18, 16 +; SI-NEXT: s_lshl_b32 s57, s8, 16 +; SI-NEXT: s_lshl_b32 s58, s19, 16 +; SI-NEXT: s_lshl_b32 s59, s9, 16 +; SI-NEXT: s_lshl_b32 s60, s20, 16 +; SI-NEXT: s_lshl_b32 s61, s10, 16 +; SI-NEXT: s_lshl_b32 s62, s21, 16 +; SI-NEXT: s_lshl_b32 s63, s11, 16 +; SI-NEXT: s_lshl_b32 s72, s22, 16 +; SI-NEXT: s_lshl_b32 s73, s12, 16 +; SI-NEXT: s_lshl_b32 s74, s23, 16 +; SI-NEXT: s_lshl_b32 s75, s13, 16 +; SI-NEXT: s_lshl_b32 s76, s24, 16 +; SI-NEXT: s_lshl_b32 s77, s14, 16 +; SI-NEXT: s_lshl_b32 s78, s25, 16 +; SI-NEXT: s_lshl_b32 s79, s15, 16 +; SI-NEXT: s_lshl_b32 s88, s26, 16 +; SI-NEXT: s_lshl_b32 s89, s40, 16 +; SI-NEXT: s_lshl_b32 s90, s27, 16 +; SI-NEXT: s_lshl_b32 s91, s41, 16 +; SI-NEXT: s_lshl_b32 s92, s28, 16 +; SI-NEXT: s_lshl_b32 s93, s42, 16 +; SI-NEXT: s_lshl_b32 s94, s29, 16 +; SI-NEXT: s_lshl_b32 s95, s43, 16 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_mov_b32_e32 v27, v32 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v32 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v18, v33 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v5 +; SI-NEXT: v_mov_b32_e32 v21, v35 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v35 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v22, v36 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v23, v38 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v25, v48 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v9 +; SI-NEXT: v_mov_b32_e32 v26, v49 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; SI-NEXT: v_mov_b32_e32 v41, v43 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v43 +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v44, v45 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v12 +; SI-NEXT: v_mov_b32_e32 v37, v46 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v13 +; SI-NEXT: v_mov_b32_e32 v57, v56 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v14 +; SI-NEXT: v_mov_b32_e32 v52, v61 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v15 +; SI-NEXT: v_mov_b32_e32 v54, v30 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v24 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v20 +; SI-NEXT: s_cbranch_execnz .LBB103_4 +; SI-NEXT: .LBB103_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v8 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v2 -; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v52 -; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v33 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v37 -; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v34 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v28 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v50 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v63 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v19 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v21 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v22 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v30 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v6 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v47 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v62 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v54 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v52 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v56 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v42 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v57 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v59, 0x38000000, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v5 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v43 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v19 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v46, v55 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v32, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v32 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v57 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v60, v55 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_add_f32_e32 v60, 0x38000000, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v14 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v13 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v20 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v18 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v37 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v61, v55 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_add_f32_e32 v61, 0x38000000, v61 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v25 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v62, v55 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v56, v53 -; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 -; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v63, v55 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v17 -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v56 -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v47 -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v63, 0x38000000, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v63 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v24 -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v62 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v61 -; SI-NEXT: v_mov_b32_e32 v18, v53 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v34, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v41 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v25 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v63 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v61 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v18 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v36 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v55 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s29 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s43 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s28 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s42 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s27 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s41 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s26 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s40 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s25 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s15 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s24 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s14 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s23 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v1 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s16 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s21 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s12 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s20 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s11 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s19 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v1 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s10 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s9 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v1 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: .LBB103_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v55 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v16 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v40 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v38 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v44 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_branch .LBB103_5 +; SI-NEXT: .LBB103_3: +; SI-NEXT: v_mov_b32_e32 v54, v30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_mov_b32_e32 v27, v32 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_mov_b32_e32 v52, v61 +; SI-NEXT: v_mov_b32_e32 v57, v56 +; SI-NEXT: v_mov_b32_e32 v37, v46 +; SI-NEXT: v_mov_b32_e32 v44, v45 +; SI-NEXT: v_mov_b32_e32 v41, v43 +; SI-NEXT: v_mov_b32_e32 v26, v49 +; SI-NEXT: v_mov_b32_e32 v25, v48 +; SI-NEXT: v_mov_b32_e32 v23, v38 +; SI-NEXT: v_mov_b32_e32 v22, v36 +; SI-NEXT: v_mov_b32_e32 v21, v35 +; SI-NEXT: v_mov_b32_e32 v19, v34 +; SI-NEXT: v_mov_b32_e32 v18, v33 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: s_branch .LBB103_2 +; SI-NEXT: .LBB103_4: +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v27, s95 +; SI-NEXT: v_mov_b32_e32 v26, s94 +; SI-NEXT: v_mov_b32_e32 v25, s93 +; SI-NEXT: v_mov_b32_e32 v24, s92 +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v43, s91 +; SI-NEXT: v_mov_b32_e32 v22, s90 +; SI-NEXT: v_mov_b32_e32 v21, s89 +; SI-NEXT: v_mov_b32_e32 v20, s88 +; SI-NEXT: v_mov_b32_e32 v19, s79 +; SI-NEXT: v_mov_b32_e32 v18, s78 +; SI-NEXT: v_mov_b32_e32 v17, s77 +; SI-NEXT: v_mov_b32_e32 v63, v47 +; SI-NEXT: v_mov_b32_e32 v47, s76 +; SI-NEXT: v_mov_b32_e32 v15, s75 +; SI-NEXT: v_mov_b32_e32 v14, s74 +; SI-NEXT: v_mov_b32_e32 v13, s73 +; SI-NEXT: v_mov_b32_e32 v12, s72 +; SI-NEXT: v_mov_b32_e32 v11, s63 +; SI-NEXT: v_mov_b32_e32 v10, s62 +; SI-NEXT: v_mov_b32_e32 v9, s61 +; SI-NEXT: v_mov_b32_e32 v57, s60 +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v45, s59 +; SI-NEXT: v_mov_b32_e32 v6, s58 +; SI-NEXT: v_mov_b32_e32 v5, s57 +; SI-NEXT: v_mov_b32_e32 v4, s56 +; SI-NEXT: v_mov_b32_e32 v3, s47 +; SI-NEXT: v_mov_b32_e32 v2, s46 +; SI-NEXT: v_mov_b32_e32 v1, s45 +; SI-NEXT: v_mov_b32_e32 v0, s44 +; SI-NEXT: v_mov_b32_e32 v16, v32 +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v48, v33 +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v34, v49 +; SI-NEXT: v_mov_b32_e32 v49, v35 +; SI-NEXT: v_mov_b32_e32 v23, v46 +; SI-NEXT: v_mov_b32_e32 v46, v53 +; SI-NEXT: v_mov_b32_e32 v53, v36 +; SI-NEXT: v_mov_b32_e32 v7, v38 +; SI-NEXT: .LBB103_5: ; %end +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v9 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v17 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v43 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v25 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v27 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v62 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v18 ; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v58 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v32 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v62, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v34 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v42 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v40 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v54 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v19 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45 -; SI-NEXT: v_lshr_b64 v[45:46], v[46:47], 16 -; SI-NEXT: v_lshr_b64 v[46:47], v[2:3], 16 -; SI-NEXT: v_lshr_b64 v[2:3], v[7:8], 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v52 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v50 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v46 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v34 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v47 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v56 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v58 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v57 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v4 ; SI-NEXT: v_lshr_b64 v[4:5], v[5:6], 16 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v60 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v59 +; SI-NEXT: v_lshr_b64 v[33:34], v[32:33], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[35:36], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[37:38], 16 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v63 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshr_b64 v[36:37], v[36:37], 16 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v23 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v61 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 -; SI-NEXT: v_lshr_b64 v[7:8], v[14:15], 16 -; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[11:12], 16 -; SI-NEXT: v_lshr_b64 v[10:11], v[20:21], 16 -; SI-NEXT: v_lshr_b64 v[11:12], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[7:8], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[8:9], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[10:11], 16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[11:12], v[12:13], 16 ; SI-NEXT: v_lshr_b64 v[12:13], v[24:25], 16 ; SI-NEXT: v_lshr_b64 v[13:14], v[26:27], 16 ; SI-NEXT: v_lshr_b64 v[14:15], v[28:29], 16 ; SI-NEXT: v_lshr_b64 v[15:16], v[30:31], 16 ; SI-NEXT: v_lshr_b64 v[16:17], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[17:18], v[56:57], 16 -; SI-NEXT: v_lshr_b64 v[18:19], v[58:59], 16 -; SI-NEXT: v_lshr_b64 v[19:20], v[60:61], 16 -; SI-NEXT: v_lshr_b64 v[20:21], v[62:63], 16 -; SI-NEXT: v_lshr_b64 v[21:22], v[32:33], 16 -; SI-NEXT: v_lshr_b64 v[22:23], v[34:35], 16 -; SI-NEXT: v_lshr_b64 v[23:24], v[36:37], 16 -; SI-NEXT: v_lshr_b64 v[24:25], v[38:39], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[48:49], 16 -; SI-NEXT: v_lshr_b64 v[26:27], v[50:51], 16 -; SI-NEXT: v_lshr_b64 v[27:28], v[52:53], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[54:55], 16 -; SI-NEXT: v_lshr_b64 v[29:30], v[41:42], 16 -; SI-NEXT: v_lshr_b64 v[30:31], v[43:44], 16 -; SI-NEXT: v_lshr_b64 v[31:32], v[0:1], 16 -; SI-NEXT: v_mov_b32_e32 v0, v45 -; SI-NEXT: v_mov_b32_e32 v1, v46 +; SI-NEXT: v_lshr_b64 v[17:18], v[42:43], 16 +; SI-NEXT: v_lshr_b64 v[18:19], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[19:20], v[39:40], 16 +; SI-NEXT: v_lshr_b64 v[20:21], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[53:54], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[51:52], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[49:50], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[44:45], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[46:47], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[55:56], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[62:63], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[57:58], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[60:61], 16 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -223772,7 +219395,26 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[30:31], v[2:3], 16 +; SI-NEXT: v_mov_b32_e32 v2, v35 +; SI-NEXT: v_mov_b32_e32 v3, v36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshr_b64 v[31:32], v[0:1], 16 +; SI-NEXT: v_mov_b32_e32 v0, v33 +; SI-NEXT: v_mov_b32_e32 v1, v34 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64f16_to_v64bf16_scalar: @@ -233338,13 +228980,8 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v64f16_to_v64i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:4 -; SI-NEXT: v_mov_b32_e32 v50, v2 -; SI-NEXT: v_mov_b32_e32 v2, v1 -; SI-NEXT: v_mov_b32_e32 v1, v0 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -233361,562 +228998,409 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v11 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v9 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v61 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v62 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v41 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v63 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v25 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v32, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v0 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v37 ; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v31 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v61 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v61, v55 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v37 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v40, v0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB108_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v30, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v31, v31, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v36 +; SI-NEXT: v_or_b32_e32 v29, v29, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v33 +; SI-NEXT: v_or_b32_e32 v27, v27, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v34 +; SI-NEXT: v_or_b32_e32 v25, v25, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; SI-NEXT: v_or_b32_e32 v31, v0, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v29, v0, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v27, v0, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v62 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_or_b32_e32 v25, v0, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v50 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v41 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v50 -; SI-NEXT: v_or_b32_e32 v23, v0, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_or_b32_e32 v21, v0, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_or_b32_e32 v19, v0, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_or_b32_e32 v17, v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_or_b32_e32 v15, v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_or_b32_e32 v13, v0, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v35, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v32 +; SI-NEXT: v_or_b32_e32 v23, v23, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v39 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v32, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v32 +; SI-NEXT: v_or_b32_e32 v21, v21, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 +; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v62 +; SI-NEXT: v_add_f32_e32 v61, 0x38000000, v61 ; SI-NEXT: v_add_f32_e32 v60, 0x38000000, v60 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v39 +; SI-NEXT: v_or_b32_e32 v19, v19, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v50 ; SI-NEXT: v_add_f32_e32 v59, 0x38000000, v59 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 ; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v57 -; SI-NEXT: v_or_b32_e32 v11, v0, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v35 ; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 ; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v47 -; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 -; SI-NEXT: v_or_b32_e32 v9, v0, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v48 +; SI-NEXT: v_or_b32_e32 v17, v17, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v51 ; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 -; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 +; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v55 +; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v35 +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v50 +; SI-NEXT: v_or_b32_e32 v15, v15, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v35 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 -; SI-NEXT: v_or_b32_e32 v7, v0, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v55 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v51 +; SI-NEXT: v_or_b32_e32 v13, v13, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 -; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v53 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52 -; SI-NEXT: v_or_b32_e32 v5, v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v35 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 -; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v53 +; SI-NEXT: v_or_b32_e32 v11, v11, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 -; SI-NEXT: v_or_b32_e32 v3, v0, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v35 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_or_b32_e32 v1, v0, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v54 +; SI-NEXT: v_or_b32_e32 v9, v9, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v50, v37, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v61 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v62 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v35 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v40 +; SI-NEXT: v_or_b32_e32 v7, v7, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v35 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v43 +; SI-NEXT: v_or_b32_e32 v5, v5, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v44 +; SI-NEXT: v_or_b32_e32 v3, v3, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_add_f32_e32 v63, 0x38000000, v63 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v60 ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; SI-NEXT: v_or_b32_e32 v41, v41, v61 -; SI-NEXT: v_or_b32_e32 v55, v55, v60 -; SI-NEXT: v_or_b32_e32 v53, v53, v59 -; SI-NEXT: v_or_b32_e32 v52, v52, v58 -; SI-NEXT: v_or_b32_e32 v51, v51, v57 -; SI-NEXT: v_or_b32_e32 v49, v49, v56 -; SI-NEXT: v_or_b32_e32 v48, v48, v47 -; SI-NEXT: v_or_b32_e32 v39, v39, v46 -; SI-NEXT: v_or_b32_e32 v38, v38, v45 -; SI-NEXT: v_or_b32_e32 v36, v36, v43 -; SI-NEXT: v_or_b32_e32 v34, v34, v42 -; SI-NEXT: v_or_b32_e32 v35, v35, v54 -; SI-NEXT: v_or_b32_e32 v33, v33, v40 -; SI-NEXT: v_alignbit_b32 v63, v1, v0, 16 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_mov_b32_e32 v41, v40 +; SI-NEXT: v_mov_b32_e32 v40, v54 +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v51 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v48 +; SI-NEXT: v_mov_b32_e32 v48, v39 +; SI-NEXT: v_mov_b32_e32 v39, v32 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v63 +; SI-NEXT: v_or_b32_e32 v0, v0, v35 +; SI-NEXT: v_or_b32_e32 v2, v2, v62 +; SI-NEXT: v_or_b32_e32 v4, v4, v61 +; SI-NEXT: v_or_b32_e32 v6, v6, v60 +; SI-NEXT: v_or_b32_e32 v8, v8, v59 +; SI-NEXT: v_or_b32_e32 v10, v10, v58 +; SI-NEXT: v_or_b32_e32 v12, v12, v57 +; SI-NEXT: v_or_b32_e32 v14, v14, v56 +; SI-NEXT: v_or_b32_e32 v16, v16, v47 +; SI-NEXT: v_or_b32_e32 v18, v18, v45 +; SI-NEXT: v_or_b32_e32 v20, v20, v42 +; SI-NEXT: v_or_b32_e32 v22, v22, v55 +; SI-NEXT: v_or_b32_e32 v24, v24, v52 +; SI-NEXT: v_or_b32_e32 v26, v26, v49 +; SI-NEXT: v_or_b32_e32 v28, v28, v38 +; SI-NEXT: v_alignbit_b32 v63, v1, v35, 16 +; SI-NEXT: v_mov_b32_e32 v35, v36 +; SI-NEXT: v_alignbit_b32 v62, v3, v62, 16 +; SI-NEXT: v_alignbit_b32 v61, v5, v61, 16 ; SI-NEXT: v_alignbit_b32 v60, v7, v60, 16 ; SI-NEXT: v_alignbit_b32 v59, v9, v59, 16 ; SI-NEXT: v_alignbit_b32 v58, v11, v58, 16 ; SI-NEXT: v_alignbit_b32 v57, v13, v57, 16 ; SI-NEXT: v_alignbit_b32 v56, v15, v56, 16 ; SI-NEXT: v_alignbit_b32 v47, v17, v47, 16 -; SI-NEXT: v_alignbit_b32 v46, v19, v46, 16 -; SI-NEXT: v_alignbit_b32 v45, v21, v45, 16 -; SI-NEXT: v_alignbit_b32 v43, v25, v43, 16 -; SI-NEXT: v_alignbit_b32 v42, v27, v42, 16 -; SI-NEXT: v_alignbit_b32 v54, v29, v54, 16 -; SI-NEXT: v_alignbit_b32 v40, v31, v40, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 -; SI-NEXT: v_or_b32_e32 v62, v62, v37 -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v45, v19, v45, 16 +; SI-NEXT: v_alignbit_b32 v42, v21, v42, 16 +; SI-NEXT: v_alignbit_b32 v55, v23, v55, 16 +; SI-NEXT: v_alignbit_b32 v52, v25, v52, 16 +; SI-NEXT: v_alignbit_b32 v49, v27, v49, 16 +; SI-NEXT: v_alignbit_b32 v38, v29, v38, 16 +; SI-NEXT: v_alignbit_b32 v36, v31, v32, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 -; SI-NEXT: v_or_b32_e32 v62, v62, v44 -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v62, v3, v61, 16 -; SI-NEXT: v_alignbit_b32 v61, v5, v37, 16 -; SI-NEXT: v_alignbit_b32 v44, v23, v44, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_or_b32_e32 v30, v30, v32 ; SI-NEXT: .LBB108_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v63 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v63 +; SI-NEXT: v_or_b32_e32 v0, v0, v32 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v41 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v62 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v2, v2, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v32 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v62 +; SI-NEXT: v_or_b32_e32 v2, v2, v32 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v44 +; SI-NEXT: v_or_b32_e32 v3, v3, v32 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v61 +; SI-NEXT: v_or_b32_e32 v4, v4, v32 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v43 +; SI-NEXT: v_or_b32_e32 v5, v5, v32 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v60 +; SI-NEXT: v_or_b32_e32 v6, v6, v32 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v41 +; SI-NEXT: v_or_b32_e32 v7, v7, v32 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v59 +; SI-NEXT: v_or_b32_e32 v8, v8, v32 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v40 +; SI-NEXT: v_or_b32_e32 v9, v9, v32 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v58 +; SI-NEXT: v_or_b32_e32 v10, v10, v32 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v54 +; SI-NEXT: v_or_b32_e32 v11, v11, v32 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v57 +; SI-NEXT: v_or_b32_e32 v12, v12, v32 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v53 +; SI-NEXT: v_or_b32_e32 v13, v13, v32 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v56 +; SI-NEXT: v_or_b32_e32 v14, v14, v32 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v51 +; SI-NEXT: v_or_b32_e32 v15, v15, v32 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v47 +; SI-NEXT: v_or_b32_e32 v16, v16, v32 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v50 +; SI-NEXT: v_or_b32_e32 v17, v17, v32 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v45 +; SI-NEXT: v_or_b32_e32 v18, v18, v32 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v38 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v36 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v43 -; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v34 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v42 -; SI-NEXT: v_or_b32_e32 v29, v29, v30 -; SI-NEXT: v_and_b32_e32 v30, 0xffff, v33 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v40 -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v26, v26, v34 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v35 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v54 -; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; SI-NEXT: v_or_b32_e32 v24, v24, v36 -; SI-NEXT: v_or_b32_e32 v28, v28, v34 -; SI-NEXT: v_or_b32_e32 v30, v30, v33 -; SI-NEXT: v_or_b32_e32 v31, v31, v32 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v4, v4, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v60 -; SI-NEXT: v_or_b32_e32 v6, v6, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v59 -; SI-NEXT: v_or_b32_e32 v8, v8, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v58 -; SI-NEXT: v_or_b32_e32 v10, v10, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v57 -; SI-NEXT: v_or_b32_e32 v12, v12, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v56 -; SI-NEXT: v_or_b32_e32 v14, v14, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v47 -; SI-NEXT: v_or_b32_e32 v16, v16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v46 -; SI-NEXT: v_or_b32_e32 v18, v18, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v45 -; SI-NEXT: v_or_b32_e32 v20, v20, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v48 +; SI-NEXT: v_or_b32_e32 v19, v19, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v42 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload @@ -233933,10 +229417,43 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v32 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v39 +; SI-NEXT: v_or_b32_e32 v21, v21, v32 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: v_or_b32_e32 v22, v22, v37 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v55 +; SI-NEXT: v_or_b32_e32 v22, v22, v32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_or_b32_e32 v23, v23, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v52 +; SI-NEXT: v_or_b32_e32 v24, v24, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v34 +; SI-NEXT: v_or_b32_e32 v25, v25, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v49 +; SI-NEXT: v_or_b32_e32 v26, v26, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v33 +; SI-NEXT: v_or_b32_e32 v27, v27, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v38 +; SI-NEXT: v_or_b32_e32 v28, v28, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v35 +; SI-NEXT: v_or_b32_e32 v29, v29, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v36 +; SI-NEXT: v_or_b32_e32 v30, v30, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v37 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64f16_to_v64i16: @@ -234177,83 +229694,10 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-LABEL: bitcast_v64f16_to_v64i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v12 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v30 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_mov_b32_e32 v28, v3 +; SI-NEXT: v_mov_b32_e32 v3, v1 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -234270,652 +229714,613 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s22 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v60, v17 +; SI-NEXT: v_mov_b32_e32 v20, v0 +; SI-NEXT: v_mov_b32_e32 v42, v15 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s20 -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v59, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v36, v5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v54, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v61, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v40, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v48, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v14 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v39 +; SI-NEXT: s_lshr_b32 s8, s29, 16 +; SI-NEXT: s_lshr_b32 s6, s28, 16 +; SI-NEXT: s_lshr_b32 s10, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: s_lshr_b32 s12, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: s_lshr_b32 s14, s23, 16 +; SI-NEXT: s_lshr_b32 s11, s22, 16 +; SI-NEXT: s_lshr_b32 s15, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s20, 16 +; SI-NEXT: s_lshr_b32 s41, s19, 16 +; SI-NEXT: s_lshr_b32 s40, s18, 16 +; SI-NEXT: s_lshr_b32 s43, s17, 16 +; SI-NEXT: s_lshr_b32 s42, s16, 16 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB109_2 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB109_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: s_branch .LBB109_3 -; SI-NEXT: .LBB109_2: -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: .LBB109_3: ; %Flow -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v40, v61 -; SI-NEXT: v_mov_b32_e32 v34, v54 -; SI-NEXT: s_cbranch_vccnz .LBB109_5 -; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v30, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v30 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v32 -; SI-NEXT: v_or_b32_e32 v29, v29, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v52 -; SI-NEXT: v_mov_b32_e32 v52, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_mov_b32_e32 v24, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v36 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_mov_b32_e32 v28, v53 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 +; SI-NEXT: s_cbranch_execnz .LBB109_4 +; SI-NEXT: .LBB109_2: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s41 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s40 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v4 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v0, v8 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s15 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_mov_b32_e32 v52, v1 +; SI-NEXT: v_mov_b32_e32 v51, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s19 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s21 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v4 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_or_b32_e32 v40, v0, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v50 +; SI-NEXT: v_or_b32_e32 v46, v2, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s13 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v28 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s23 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s11 +; SI-NEXT: v_or_b32_e32 v26, v2, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s12 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s9 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_or_b32_e32 v54, v2, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s10 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s27 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v33 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v43 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: v_or_b32_e32 v44, v2, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s29 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_or_b32_e32 v38, v2, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_mov_b32_e32 v58, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v6 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v8 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v4 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v34 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v29 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v29, v7, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v12 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v44 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v41 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v50 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v38 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v42 -; SI-NEXT: v_or_b32_e32 v31, v31, v38 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v56 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_or_b32_e32 v36, v39, v2 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v63 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_or_b32_e32 v45, v48, v4 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v40, v39, v8 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v58 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_or_b32_e32 v31, v5, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v42 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v7 +; SI-NEXT: v_or_b32_e32 v48, v11, v12 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v37 -; SI-NEXT: v_mov_b32_e32 v49, v55 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v63 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v21 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v16 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v6 +; SI-NEXT: v_mov_b32_e32 v5, v28 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_mov_b32_e32 v6, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s26 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v24, v1, v2 +; SI-NEXT: v_mov_b32_e32 v62, v24 +; SI-NEXT: v_mov_b32_e32 v61, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v17 +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v17, v60 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mov_b32_e32 v33, v3 +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v14 +; SI-NEXT: v_mov_b32_e32 v2, v20 +; SI-NEXT: v_or_b32_e32 v3, v18, v51 +; SI-NEXT: v_mov_b32_e32 v20, v47 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_or_b32_e32 v36, v13, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v55 +; SI-NEXT: v_mov_b32_e32 v56, v40 +; SI-NEXT: v_mov_b32_e32 v55, v39 +; SI-NEXT: v_or_b32_e32 v10, v19, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 +; SI-NEXT: v_or_b32_e32 v42, v15, v22 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v14 +; SI-NEXT: v_mov_b32_e32 v14, v25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v1 +; SI-NEXT: v_or_b32_e32 v16, v19, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v2 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v26 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v27, s20 +; SI-NEXT: v_mov_b32_e32 v15, v26 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_mov_b32_e32 v21, v48 +; SI-NEXT: v_or_b32_e32 v60, v17, v22 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v38, v38, v0 -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v13, v18, v45 +; SI-NEXT: v_or_b32_e32 v22, v28, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s28 +; SI-NEXT: v_or_b32_e32 v12, v27, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v19 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v39, v26, v37 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v23 +; SI-NEXT: v_or_b32_e32 v47, v27, v61 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v19 +; SI-NEXT: v_mov_b32_e32 v18, v30 +; SI-NEXT: v_mov_b32_e32 v19, v31 +; SI-NEXT: v_or_b32_e32 v23, v25, v32 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[25:26], v[61:62], 16 +; SI-NEXT: v_lshr_b64 v[61:62], v[18:19], 16 +; SI-NEXT: v_mov_b32_e32 v26, v21 +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v1 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v53 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshr_b64 v[62:63], v[0:1], 16 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshr_b64 v[56:57], v[6:7], 16 -; SI-NEXT: v_mov_b32_e32 v57, v43 -; SI-NEXT: v_mov_b32_e32 v43, v44 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_mov_b32_e32 v63, v35 -; SI-NEXT: v_lshr_b64 v[46:47], v[8:9], 16 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_lshr_b64 v[60:61], v[2:3], 16 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_or_b32_e32 v39, v48, v10 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v2 +; SI-NEXT: v_or_b32_e32 v2, v24, v18 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_or_b32_e32 v38, v38, v6 -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v23, v5 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v18, v34 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v53 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_or_b32_e32 v39, v39, v14 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_or_b32_e32 v2, v27, v8 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v53 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v39, v48, v16 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v49 -; SI-NEXT: v_or_b32_e32 v38, v38, v12 -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v38, v52 -; SI-NEXT: v_or_b32_e32 v34, v39, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v36 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_or_b32_e32 v55, v48, v22 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_or_b32_e32 v52, v38, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v58 -; SI-NEXT: v_lshr_b64 v[58:59], v[4:5], 16 -; SI-NEXT: v_mov_b32_e32 v59, v51 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v28 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v53 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v48, v36 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, v36 -; SI-NEXT: v_or_b32_e32 v36, v38, v24 -; SI-NEXT: v_or_b32_e32 v38, v39, v26 -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v38, v48, v28 -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v44, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v28, v4 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v4, v28, v20 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v49 +; SI-NEXT: v_mov_b32_e32 v48, v51 +; SI-NEXT: v_mov_b32_e32 v49, v52 +; SI-NEXT: v_mov_b32_e32 v51, v55 +; SI-NEXT: v_mov_b32_e32 v52, v56 +; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 16 +; SI-NEXT: v_mov_b32_e32 v48, v19 +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: v_mov_b32_e32 v3, v44 +; SI-NEXT: v_lshr_b64 v[55:56], v[32:33], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[56:57], v[5:6], 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v38, v49, v30 -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v3, v47 +; SI-NEXT: v_mov_b32_e32 v47, v33 +; SI-NEXT: v_lshr_b64 v[32:33], v[35:36], 16 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v40, v23 +; SI-NEXT: v_or_b32_e32 v23, v27, v35 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[38:39], v[12:13], 16 -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: v_lshr_b64 v[35:36], v[18:19], 16 -; SI-NEXT: v_mov_b32_e32 v18, v33 -; SI-NEXT: v_mov_b32_e32 v33, v50 -; SI-NEXT: v_lshr_b64 v[50:51], v[20:21], 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[38:39], v[14:15], 16 -; SI-NEXT: v_mov_b32_e32 v36, v52 -; SI-NEXT: v_lshr_b64 v[51:52], v[26:27], 16 -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshr_b64 v[38:39], v[16:17], 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[51:52], v[28:29], 16 -; SI-NEXT: v_lshr_b64 v[47:48], v[10:11], 16 -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: v_lshr_b64 v[48:49], v[22:23], 16 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshr_b64 v[38:39], v[24:25], 16 +; SI-NEXT: v_or_b32_e32 v23, v30, v41 +; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[27:28], v[51:52], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[45:46], 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v23, v40, v59 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v24, v15 +; SI-NEXT: v_mov_b32_e32 v23, v14 +; SI-NEXT: v_lshr_b64 v[14:15], v[23:24], 16 +; SI-NEXT: v_mov_b32_e32 v40, v7 +; SI-NEXT: v_mov_b32_e32 v7, v12 +; SI-NEXT: v_mov_b32_e32 v15, v11 +; SI-NEXT: v_lshr_b64 v[11:12], v[53:54], 16 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v12, v16 +; SI-NEXT: v_lshr_b64 v[16:17], v[43:44], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[37:38], 16 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v29, v58 +; SI-NEXT: v_lshr_b64 v[57:58], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[44:45], v[8:9], 16 +; SI-NEXT: v_mov_b32_e32 v35, v63 +; SI-NEXT: v_lshr_b64 v[62:63], v[41:42], 16 +; SI-NEXT: v_lshr_b64 v[58:59], v[59:60], 16 +; SI-NEXT: v_mov_b32_e32 v8, v50 +; SI-NEXT: v_mov_b32_e32 v50, v46 +; SI-NEXT: v_mov_b32_e32 v41, v6 +; SI-NEXT: v_mov_b32_e32 v46, v32 +; SI-NEXT: v_mov_b32_e32 v20, v22 +; SI-NEXT: v_mov_b32_e32 v21, v0 +; SI-NEXT: s_branch .LBB109_5 +; SI-NEXT: .LBB109_3: +; SI-NEXT: s_branch .LBB109_2 +; SI-NEXT: .LBB109_4: +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v0, s41 +; SI-NEXT: v_mov_b32_e32 v1, s27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v37, s8 +; SI-NEXT: v_mov_b32_e32 v43, s10 +; SI-NEXT: v_mov_b32_e32 v53, s12 +; SI-NEXT: v_mov_b32_e32 v31, s14 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v8, s15 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v4, s43 +; SI-NEXT: v_mov_b32_e32 v49, s17 +; SI-NEXT: v_mov_b32_e32 v52, s19 +; SI-NEXT: v_mov_b32_e32 v50, s21 +; SI-NEXT: v_mov_b32_e32 v24, s23 +; SI-NEXT: v_mov_b32_e32 v54, s25 +; SI-NEXT: v_mov_b32_e32 v38, s29 +; SI-NEXT: v_mov_b32_e32 v3, v20 +; SI-NEXT: v_mov_b32_e32 v39, s28 +; SI-NEXT: v_mov_b32_e32 v47, v28 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v58, v41 +; SI-NEXT: v_mov_b32_e32 v62, v22 +; SI-NEXT: v_mov_b32_e32 v41, v7 +; SI-NEXT: v_mov_b32_e32 v48, v5 +; SI-NEXT: v_mov_b32_e32 v46, v21 +; SI-NEXT: v_mov_b32_e32 v36, v13 +; SI-NEXT: v_mov_b32_e32 v44, v12 +; SI-NEXT: v_mov_b32_e32 v26, v11 +; SI-NEXT: v_mov_b32_e32 v7, s24 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v13, s20 +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v10, s18 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[51:52], v[30:31], 16 +; SI-NEXT: v_mov_b32_e32 v55, v17 +; SI-NEXT: v_mov_b32_e32 v25, v16 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v1, s42 +; SI-NEXT: v_mov_b32_e32 v15, v27 +; SI-NEXT: v_mov_b32_e32 v27, s40 +; SI-NEXT: v_mov_b32_e32 v30, s13 +; SI-NEXT: v_mov_b32_e32 v14, s11 +; SI-NEXT: v_mov_b32_e32 v11, s9 +; SI-NEXT: v_mov_b32_e32 v16, s7 +; SI-NEXT: v_mov_b32_e32 v28, s6 +; SI-NEXT: v_mov_b32_e32 v40, v33 +; SI-NEXT: v_mov_b32_e32 v21, v32 ; SI-NEXT: .LBB109_5: ; %end -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v62 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v45 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v40 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v35 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v50 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v48 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v38 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v33, v2, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v10 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v58 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v51, v4, v6 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v13 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v43 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v56 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v5, v6, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v12 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v57 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v46 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v32, v8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v41 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v47 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v53 +; SI-NEXT: v_or_b32_e32 v34, v10, v12 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v20 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v37 +; SI-NEXT: v_mov_b32_e32 v7, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v11, v12, v14 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v28 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v39 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v13, v14, v16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v25 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v37 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v39 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v36 -; SI-NEXT: v_or_b32_e32 v18, v18, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v39, v1, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v34 -; SI-NEXT: v_or_b32_e32 v20, v20, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v47 +; SI-NEXT: v_or_b32_e32 v17, v1, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v22, v22, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v33 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v44 -; SI-NEXT: v_or_b32_e32 v24, v24, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v63 -; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v33 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; SI-NEXT: v_or_b32_e32 v26, v26, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v53 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v33 -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; SI-NEXT: v_or_b32_e32 v28, v28, v30 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v32 -; SI-NEXT: v_or_b32_e32 v29, v29, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v51 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v18, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 +; SI-NEXT: v_or_b32_e32 v19, v1, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v30, v30, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v42 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v20, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v1, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v22, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_or_b32_e32 v23, v1, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 +; SI-NEXT: v_mov_b32_e32 v9, v34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v24, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 +; SI-NEXT: v_or_b32_e32 v25, v1, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v46 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v26, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v27, v1, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v28, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 +; SI-NEXT: v_or_b32_e32 v29, v1, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 +; SI-NEXT: v_mov_b32_e32 v15, v39 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v30, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v60 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -234932,8 +230337,12 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v31, v31, v32 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v31, v1, v3 +; SI-NEXT: v_mov_b32_e32 v1, v33 +; SI-NEXT: v_mov_b32_e32 v3, v51 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64f16_to_v64i16_scalar: @@ -235249,6 +230658,76 @@ define <64 x half> @bitcast_v64i16_to_v64f16(<64 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v64i16_to_v64f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v4 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v48 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -235265,949 +230744,867 @@ define <64 x half> @bitcast_v64i16_to_v64f16(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v24 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v58 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v5 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v63 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v60 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v40 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v54 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v13 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v15 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v57 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v56 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v0 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v56 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v17 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v47 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v52 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v37 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v46 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v59 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v42 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v52 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB110_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v58 +; SI-NEXT: v_or_b32_e32 v61, v1, v50 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 +; SI-NEXT: v_or_b32_e32 v47, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v60 +; SI-NEXT: v_or_b32_e32 v62, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 +; SI-NEXT: v_or_b32_e32 v59, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 +; SI-NEXT: v_or_b32_e32 v57, v1, v3 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v51 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_alignbit_b32 v1, v61, v5, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v60 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v56, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_alignbit_b32 v1, v47, v7, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v46, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v1, v62, v9, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v57 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v44, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v1, v59, v11, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v43, v1, v3 +; SI-NEXT: v_alignbit_b32 v1, v57, v13, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v42, v1, v48 +; SI-NEXT: v_alignbit_b32 v1, v56, v15, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v41, v1, v49 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_alignbit_b32 v1, v46, v17, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v55, v1, v39 +; SI-NEXT: v_alignbit_b32 v1, v44, v34, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v53, v1, v4 +; SI-NEXT: v_alignbit_b32 v1, v43, v35, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v62 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v4, v1, v6 +; SI-NEXT: v_alignbit_b32 v1, v42, v32, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v3, v1, v8 +; SI-NEXT: v_alignbit_b32 v1, v41, v36, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v50, v1, v31 +; SI-NEXT: v_alignbit_b32 v1, v55, v45, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v53, v38, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v4, v33, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v3, v0, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v50, v2, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v1, v1, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v1, v1, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v1, v1, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v1, v1, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v54 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v41 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v25 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v56 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: .LBB110_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB110_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v60 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v60 -; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v57 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v58 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v57 -; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v59 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_add_i32_e32 v62, vcc, 3, v62 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 -; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v63 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v62 -; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 -; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v31 -; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 -; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 -; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 -; SI-NEXT: v_add_i32_e32 v51, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 -; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 -; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v40 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: v_add_i32_e32 v41, vcc, 3, v41 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 -; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v42 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 -; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v43 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 -; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v44 ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v45 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v46 +; SI-NEXT: v_or_b32_e32 v0, v0, v28 +; SI-NEXT: s_mov_b32 s6, 0x30000 ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 -; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v47 -; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 -; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v61 -; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v2, v2, v30 +; SI-NEXT: v_or_b32_e32 v8, v8, v28 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v6, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_add_i32_e32 v53, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_add_i32_e32 v55, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_add_i32_e32 v41, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_add_i32_e32 v42, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_add_i32_e32 v50, vcc, s6, v30 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v50 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_add_i32_e32 v43, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v15, v0 +; SI-NEXT: v_add_i32_e32 v44, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v12, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v13, v0 +; SI-NEXT: v_add_i32_e32 v46, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v10, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_add_i32_e32 v56, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v56 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v9, v0 +; SI-NEXT: v_add_i32_e32 v57, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v57 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v9, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v7, v0 +; SI-NEXT: v_add_i32_e32 v59, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v59 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v7, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: v_add_i32_e32 v62, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v62 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v3, v0 +; SI-NEXT: v_add_i32_e32 v47, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v47 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v3, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v61, vcc, s6, v0 +; SI-NEXT: v_alignbit_b32 v0, v61, v3, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v47, v5, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v62, v7, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v59, v9, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v57, v11, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v56, v10, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v46, v12, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v44, v14, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v43, v16, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v42, v18, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v41, v20, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v55, v4, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v53, v24, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v6, v8, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v2, v29, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v50, v28, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: .LBB110_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v31 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v51 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v52 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v47 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v62 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v59 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v56 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v46 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; SI-NEXT: v_or_b32_e32 v31, v33, v31 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64i16_to_v64f16: @@ -236447,746 +231844,812 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; SI-LABEL: bitcast_v64i16_to_v64f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_writelane_b32 v32, s30, 0 +; SI-NEXT: v_writelane_b32 v32, s31, 1 +; SI-NEXT: v_writelane_b32 v32, s34, 2 +; SI-NEXT: v_writelane_b32 v32, s35, 3 +; SI-NEXT: v_writelane_b32 v32, s36, 4 +; SI-NEXT: v_writelane_b32 v32, s37, 5 +; SI-NEXT: v_writelane_b32 v32, s38, 6 +; SI-NEXT: v_writelane_b32 v32, s39, 7 +; SI-NEXT: v_writelane_b32 v32, s48, 8 +; SI-NEXT: v_writelane_b32 v32, s49, 9 +; SI-NEXT: v_writelane_b32 v32, s50, 10 +; SI-NEXT: v_writelane_b32 v32, s51, 11 +; SI-NEXT: v_writelane_b32 v32, s52, 12 +; SI-NEXT: v_writelane_b32 v32, s53, 13 +; SI-NEXT: v_writelane_b32 v32, s54, 14 +; SI-NEXT: v_writelane_b32 v32, s55, 15 +; SI-NEXT: v_writelane_b32 v32, s64, 16 +; SI-NEXT: v_writelane_b32 v32, s65, 17 +; SI-NEXT: v_writelane_b32 v32, s66, 18 +; SI-NEXT: v_writelane_b32 v32, s67, 19 +; SI-NEXT: v_writelane_b32 v32, s68, 20 +; SI-NEXT: v_writelane_b32 v32, s69, 21 +; SI-NEXT: v_writelane_b32 v32, s70, 22 +; SI-NEXT: v_writelane_b32 v32, s71, 23 +; SI-NEXT: v_writelane_b32 v32, s80, 24 +; SI-NEXT: v_readfirstlane_b32 s4, v8 +; SI-NEXT: ; implicit-def: $vgpr33 : SGPR spill to VGPR lane +; SI-NEXT: v_writelane_b32 v32, s81, 25 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v33, s4, 0 +; SI-NEXT: v_readfirstlane_b32 s4, v7 +; SI-NEXT: v_writelane_b32 v32, s82, 26 +; SI-NEXT: v_writelane_b32 v33, s4, 1 +; SI-NEXT: v_readfirstlane_b32 s4, v5 +; SI-NEXT: v_writelane_b32 v32, s83, 27 +; SI-NEXT: v_writelane_b32 v33, s4, 2 +; SI-NEXT: v_readfirstlane_b32 s4, v4 +; SI-NEXT: v_writelane_b32 v32, s84, 28 +; SI-NEXT: v_writelane_b32 v33, s4, 3 +; SI-NEXT: v_writelane_b32 v32, s85, 29 +; SI-NEXT: v_writelane_b32 v33, s29, 4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_writelane_b32 v32, s86, 30 +; SI-NEXT: v_writelane_b32 v33, s4, 5 +; SI-NEXT: v_writelane_b32 v32, s87, 31 +; SI-NEXT: v_writelane_b32 v33, s27, 6 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_writelane_b32 v32, s96, 32 +; SI-NEXT: v_writelane_b32 v33, s4, 7 +; SI-NEXT: v_writelane_b32 v32, s97, 33 +; SI-NEXT: v_writelane_b32 v33, s25, 8 +; SI-NEXT: v_writelane_b32 v32, s98, 34 +; SI-NEXT: v_writelane_b32 v33, s23, 9 +; SI-NEXT: v_writelane_b32 v32, s99, 35 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: s_lshr_b32 s43, s29, 16 -; SI-NEXT: s_lshr_b32 s42, s28, 16 -; SI-NEXT: s_lshr_b32 s41, s27, 16 -; SI-NEXT: s_lshr_b32 s40, s26, 16 -; SI-NEXT: s_lshr_b32 s15, s25, 16 -; SI-NEXT: s_lshr_b32 s14, s24, 16 -; SI-NEXT: s_lshr_b32 s13, s23, 16 -; SI-NEXT: s_lshr_b32 s12, s22, 16 -; SI-NEXT: s_lshr_b32 s11, s21, 16 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s16, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v16 +; SI-NEXT: s_lshr_b32 s48, s29, 16 +; SI-NEXT: s_lshr_b32 s76, s27, 16 +; SI-NEXT: s_lshr_b32 s67, s25, 16 +; SI-NEXT: s_lshr_b32 s35, s24, 16 +; SI-NEXT: s_lshr_b32 s65, s23, 16 +; SI-NEXT: s_lshr_b32 s31, s22, 16 +; SI-NEXT: s_lshr_b32 s55, s21, 16 +; SI-NEXT: s_lshr_b32 s95, s20, 16 +; SI-NEXT: v_writelane_b32 v33, s19, 10 +; SI-NEXT: s_lshr_b32 s53, s19, 16 +; SI-NEXT: s_lshr_b32 s93, s18, 16 +; SI-NEXT: s_lshr_b32 s51, s17, 16 +; SI-NEXT: s_lshr_b32 s99, s16, 16 +; SI-NEXT: v_writelane_b32 v33, s17, 11 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 +; SI-NEXT: v_writelane_b32 v33, s16, 12 +; SI-NEXT: v_readfirstlane_b32 s4, v1 +; SI-NEXT: s_mov_b32 s56, s18 +; SI-NEXT: v_writelane_b32 v33, s4, 13 +; SI-NEXT: s_mov_b32 s58, s22 +; SI-NEXT: v_writelane_b32 v33, s56, 14 +; SI-NEXT: s_mov_b32 s59, s24 +; SI-NEXT: v_writelane_b32 v33, s58, 15 +; SI-NEXT: s_mov_b32 s97, s26 +; SI-NEXT: v_writelane_b32 v33, s59, 16 +; SI-NEXT: s_mov_b32 s85, s28 +; SI-NEXT: v_writelane_b32 v33, s97, 17 +; SI-NEXT: s_mov_b32 s57, s20 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_readfirstlane_b32 s36, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; SI-NEXT: v_readfirstlane_b32 s78, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_readfirstlane_b32 s79, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_writelane_b32 v33, s85, 18 +; SI-NEXT: v_readfirstlane_b32 s81, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 +; SI-NEXT: v_readfirstlane_b32 s89, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_readfirstlane_b32 s37, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 +; SI-NEXT: v_readfirstlane_b32 s70, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_readfirstlane_b32 s39, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; SI-NEXT: v_readfirstlane_b32 s90, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_readfirstlane_b32 s38, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_readfirstlane_b32 s30, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_readfirstlane_b32 s91, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_readfirstlane_b32 s46, v15 +; SI-NEXT: v_writelane_b32 v33, s57, 19 +; SI-NEXT: s_mov_b32 s88, s21 +; SI-NEXT: v_readfirstlane_b32 s94, v0 +; SI-NEXT: v_readfirstlane_b32 s96, v19 +; SI-NEXT: v_readfirstlane_b32 s47, v17 +; SI-NEXT: v_readfirstlane_b32 s87, v16 +; SI-NEXT: v_readfirstlane_b32 s98, v14 +; SI-NEXT: v_readfirstlane_b32 s92, v13 +; SI-NEXT: v_readfirstlane_b32 s83, v12 +; SI-NEXT: v_readfirstlane_b32 s25, v11 +; SI-NEXT: v_readfirstlane_b32 s82, v10 +; SI-NEXT: v_readfirstlane_b32 s68, v9 +; SI-NEXT: v_readfirstlane_b32 s80, v8 +; SI-NEXT: v_readfirstlane_b32 s29, v7 +; SI-NEXT: v_readfirstlane_b32 s71, v6 +; SI-NEXT: v_readfirstlane_b32 s17, v5 +; SI-NEXT: v_readfirstlane_b32 s49, v4 +; SI-NEXT: v_readfirstlane_b32 s77, v18 +; SI-NEXT: v_readfirstlane_b32 s84, v3 +; SI-NEXT: v_readfirstlane_b32 s27, v2 +; SI-NEXT: v_writelane_b32 v33, s46, 20 +; SI-NEXT: v_writelane_b32 v33, s47, 21 ; SI-NEXT: s_cbranch_scc0 .LBB111_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 -; SI-NEXT: v_mov_b32_e32 v37, v21 -; SI-NEXT: v_mov_b32_e32 v39, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v22 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s6 -; SI-NEXT: v_mov_b32_e32 v48, v23 -; SI-NEXT: v_mov_b32_e32 v49, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v24 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v8 -; SI-NEXT: v_mov_b32_e32 v55, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v10 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s42 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s43 -; SI-NEXT: v_mov_b32_e32 v50, v25 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v25 -; SI-NEXT: v_mov_b32_e32 v51, v26 -; SI-NEXT: v_mov_b32_e32 v52, v27 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v27 -; SI-NEXT: v_mov_b32_e32 v53, v28 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s9 -; SI-NEXT: v_mov_b32_e32 v54, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v9 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s20 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v40, v32 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v32 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v11 -; SI-NEXT: v_mov_b32_e32 v41, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v36 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s21 -; SI-NEXT: v_mov_b32_e32 v42, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v13 -; SI-NEXT: v_mov_b32_e32 v43, v59 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v14 -; SI-NEXT: v_mov_b32_e32 v44, v63 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v63 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v46 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v16 -; SI-NEXT: v_mov_b32_e32 v36, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s23 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s13 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s24 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s14 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s15 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s26 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s40 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s27 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s41 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v0 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v6 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v1 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v7 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v15 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v12 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 -; SI-NEXT: v_mov_b32_e32 v46, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v30 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v4 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v5 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v38 -; SI-NEXT: v_mov_b32_e32 v38, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v17 +; SI-NEXT: v_readlane_b32 s5, v33, 11 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s7, s51, 16 +; SI-NEXT: s_or_b32 s43, s5, s7 +; SI-NEXT: v_readlane_b32 s5, v33, 10 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s7, s53, 16 +; SI-NEXT: s_or_b32 s41, s5, s7 +; SI-NEXT: s_and_b32 s5, s88, 0xffff +; SI-NEXT: s_lshl_b32 s7, s55, 16 +; SI-NEXT: s_or_b32 s15, s5, s7 +; SI-NEXT: v_readlane_b32 s5, v33, 9 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s7, s65, 16 +; SI-NEXT: s_or_b32 s13, s5, s7 +; SI-NEXT: v_readlane_b32 s5, v33, 8 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s7, s67, 16 +; SI-NEXT: s_or_b32 s11, s5, s7 +; SI-NEXT: v_readlane_b32 s5, v33, 6 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s7, s76, 16 +; SI-NEXT: s_or_b32 s9, s5, s7 +; SI-NEXT: v_readlane_b32 s5, v33, 4 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s7, s48, 16 +; SI-NEXT: v_readlane_b32 s4, v33, 7 +; SI-NEXT: s_or_b32 s7, s5, s7 +; SI-NEXT: v_readlane_b32 s5, v33, 13 +; SI-NEXT: s_lshl_b32 s8, s4, 16 +; SI-NEXT: v_readlane_b32 s4, v33, 5 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s44, s84, 16 +; SI-NEXT: v_readlane_b32 s16, v33, 2 +; SI-NEXT: v_writelane_b32 v33, s39, 34 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s30, 0xffff +; SI-NEXT: s_lshl_b32 s45, s49, 16 +; SI-NEXT: v_writelane_b32 v33, s38, 35 +; SI-NEXT: s_or_b32 vcc_hi, s44, s45 +; SI-NEXT: s_and_b32 s44, s16, 0xffff +; SI-NEXT: s_lshl_b32 s45, s71, 16 +; SI-NEXT: v_readlane_b32 s16, v33, 1 +; SI-NEXT: s_or_b32 s39, s44, s45 +; SI-NEXT: s_and_b32 s44, s16, 0xffff +; SI-NEXT: s_lshl_b32 s45, s80, 16 +; SI-NEXT: v_writelane_b32 v33, s49, 36 +; SI-NEXT: v_writelane_b32 v33, s48, 37 +; SI-NEXT: s_or_b32 s49, s44, s45 +; SI-NEXT: s_and_b32 s44, s90, 0xffff +; SI-NEXT: s_lshl_b32 s45, s82, 16 +; SI-NEXT: v_writelane_b32 v33, s29, 38 +; SI-NEXT: s_mov_b32 s69, s51 +; SI-NEXT: s_or_b32 s51, s44, s45 +; SI-NEXT: s_and_b32 s44, s70, 0xffff +; SI-NEXT: s_lshl_b32 s45, s83, 16 +; SI-NEXT: v_writelane_b32 v33, s27, 39 +; SI-NEXT: s_mov_b32 s34, s83 +; SI-NEXT: s_mov_b32 s83, s70 +; SI-NEXT: s_mov_b32 s70, s53 +; SI-NEXT: s_or_b32 s53, s44, s45 +; SI-NEXT: s_and_b32 s44, s89, 0xffff +; SI-NEXT: s_lshl_b32 s45, s98, 16 +; SI-NEXT: s_mov_b32 s86, s71 +; SI-NEXT: s_mov_b32 s71, s55 +; SI-NEXT: s_or_b32 s55, s44, s45 +; SI-NEXT: s_and_b32 s44, s79, 0xffff +; SI-NEXT: s_lshl_b32 s45, s87, 16 +; SI-NEXT: v_writelane_b32 v33, s92, 40 +; SI-NEXT: s_lshl_b32 s38, s17, 16 +; SI-NEXT: s_lshl_b32 s54, s92, 16 +; SI-NEXT: v_writelane_b32 v33, s25, 41 +; SI-NEXT: s_mov_b32 s92, s17 +; SI-NEXT: s_mov_b32 s17, s79 +; SI-NEXT: s_mov_b32 s79, s89 +; SI-NEXT: s_mov_b32 s89, s87 +; SI-NEXT: s_mov_b32 s87, s30 +; SI-NEXT: s_mov_b32 s30, s80 +; SI-NEXT: s_mov_b32 s80, s65 +; SI-NEXT: s_or_b32 s65, s44, s45 +; SI-NEXT: s_and_b32 s44, s36, 0xffff +; SI-NEXT: s_lshl_b32 s45, s96, 16 +; SI-NEXT: s_mov_b32 s16, s36 +; SI-NEXT: s_mov_b32 s36, s96 +; SI-NEXT: s_mov_b32 s96, s82 +; SI-NEXT: s_mov_b32 s82, s76 +; SI-NEXT: s_mov_b32 s76, s67 +; SI-NEXT: s_or_b32 s67, s44, s45 +; SI-NEXT: v_readlane_b32 s44, v33, 12 +; SI-NEXT: s_lshl_b32 s42, s99, 16 +; SI-NEXT: s_and_b32 s44, s44, 0xffff +; SI-NEXT: s_lshl_b32 s40, s93, 16 +; SI-NEXT: s_or_b32 s62, s44, s42 +; SI-NEXT: s_lshr_b64 s[18:19], s[42:43], 16 +; SI-NEXT: s_and_b32 s42, s56, 0xffff +; SI-NEXT: s_lshl_b32 s14, s95, 16 +; SI-NEXT: s_or_b32 s74, s42, s40 +; SI-NEXT: s_lshr_b64 s[20:21], s[40:41], 16 +; SI-NEXT: s_and_b32 s40, s57, 0xffff +; SI-NEXT: s_lshl_b32 s12, s31, 16 +; SI-NEXT: s_or_b32 s72, s40, s14 +; SI-NEXT: s_lshr_b64 s[22:23], s[14:15], 16 +; SI-NEXT: s_and_b32 s14, s58, 0xffff +; SI-NEXT: s_lshl_b32 s10, s35, 16 +; SI-NEXT: s_lshl_b32 s52, s25, 16 +; SI-NEXT: s_or_b32 s60, s14, s12 +; SI-NEXT: s_lshr_b64 s[24:25], s[12:13], 16 +; SI-NEXT: s_and_b32 s12, s59, 0xffff +; SI-NEXT: s_lshl_b32 s6, s4, 16 +; SI-NEXT: s_lshl_b32 s4, s27, 16 +; SI-NEXT: s_or_b32 s58, s12, s10 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 16 +; SI-NEXT: s_and_b32 s10, s97, 0xffff +; SI-NEXT: s_lshl_b32 s48, s29, 16 +; SI-NEXT: s_or_b32 s56, s10, s8 +; SI-NEXT: s_lshr_b64 s[28:29], s[8:9], 16 +; SI-NEXT: s_and_b32 s8, s85, 0xffff +; SI-NEXT: s_lshl_b32 s64, s46, 16 +; SI-NEXT: s_lshl_b32 s66, s47, 16 +; SI-NEXT: s_or_b32 s46, s8, s6 +; SI-NEXT: s_mov_b32 s47, s7 +; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], 16 +; SI-NEXT: v_writelane_b32 v33, s6, 22 +; SI-NEXT: v_writelane_b32 v33, s7, 23 +; SI-NEXT: s_and_b32 s6, s94, 0xffff +; SI-NEXT: s_or_b32 s44, s6, s4 +; SI-NEXT: s_mov_b32 s45, s5 +; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 +; SI-NEXT: v_writelane_b32 v33, s4, 24 +; SI-NEXT: s_lshl_b32 vcc_lo, s77, 16 +; SI-NEXT: v_writelane_b32 v33, s5, 25 +; SI-NEXT: s_and_b32 s4, s91, 0xffff +; SI-NEXT: s_or_b32 s42, s4, vcc_lo +; SI-NEXT: s_lshr_b64 s[4:5], vcc, 16 +; SI-NEXT: v_writelane_b32 v33, s4, 26 +; SI-NEXT: v_writelane_b32 v33, s5, 27 +; SI-NEXT: v_readlane_b32 s4, v33, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s40, s4, s38 +; SI-NEXT: s_lshr_b64 s[4:5], s[38:39], 16 +; SI-NEXT: v_writelane_b32 v33, s4, 28 +; SI-NEXT: v_writelane_b32 v33, s5, 29 +; SI-NEXT: v_readlane_b32 s38, v33, 35 +; SI-NEXT: s_and_b32 s4, s38, 0xffff +; SI-NEXT: s_or_b32 s14, s4, s48 +; SI-NEXT: s_lshr_b64 s[4:5], s[48:49], 16 +; SI-NEXT: s_mov_b32 s75, s41 +; SI-NEXT: s_mov_b32 s41, s39 +; SI-NEXT: v_readlane_b32 s39, v33, 34 +; SI-NEXT: v_writelane_b32 v33, s4, 30 +; SI-NEXT: v_writelane_b32 v33, s5, 31 +; SI-NEXT: v_readlane_b32 s4, v33, 0 +; SI-NEXT: s_lshl_b32 s50, s68, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s12, s4, s50 +; SI-NEXT: s_lshr_b64 s[4:5], s[50:51], 16 +; SI-NEXT: s_mov_b32 s73, s15 +; SI-NEXT: s_mov_b32 s15, s49 +; SI-NEXT: v_readlane_b32 s48, v33, 37 +; SI-NEXT: v_readlane_b32 s49, v33, 36 +; SI-NEXT: v_writelane_b32 v33, s4, 32 +; SI-NEXT: v_writelane_b32 v33, s5, 33 +; SI-NEXT: s_and_b32 s4, s39, 0xffff +; SI-NEXT: s_or_b32 s10, s4, s52 +; SI-NEXT: s_and_b32 s4, s37, 0xffff +; SI-NEXT: s_or_b32 s8, s4, s54 +; SI-NEXT: s_and_b32 s4, s81, 0xffff +; SI-NEXT: s_or_b32 s6, s4, s64 +; SI-NEXT: s_and_b32 s4, s78, 0xffff +; SI-NEXT: s_mov_b32 s59, s11 +; SI-NEXT: s_mov_b32 s57, s9 +; SI-NEXT: s_mov_b32 s11, s53 +; SI-NEXT: s_lshr_b64 s[52:53], s[52:53], 16 +; SI-NEXT: s_mov_b32 s9, s55 +; SI-NEXT: s_lshr_b64 s[54:55], s[54:55], 16 +; SI-NEXT: s_mov_b32 s7, s65 +; SI-NEXT: s_lshr_b64 s[64:65], s[64:65], 16 +; SI-NEXT: s_or_b32 s4, s4, s66 +; SI-NEXT: s_mov_b32 s5, s67 +; SI-NEXT: s_lshr_b64 s[66:67], s[66:67], 16 +; SI-NEXT: s_mov_b32 s63, s43 +; SI-NEXT: s_mov_b32 s61, s13 +; SI-NEXT: s_mov_b32 s43, vcc_hi +; SI-NEXT: s_mov_b32 s13, s51 +; SI-NEXT: s_mov_b32 s51, s69 +; SI-NEXT: v_readlane_b32 s29, v33, 38 +; SI-NEXT: s_mov_b32 s53, s70 +; SI-NEXT: s_mov_b32 s70, s83 +; SI-NEXT: s_mov_b32 s83, s34 +; SI-NEXT: v_readlane_b32 s27, v33, 39 +; SI-NEXT: s_mov_b32 s55, s71 +; SI-NEXT: s_mov_b32 s71, s86 +; SI-NEXT: s_mov_b32 s65, s80 +; SI-NEXT: s_mov_b32 s80, s30 +; SI-NEXT: s_mov_b32 s30, s87 +; SI-NEXT: s_mov_b32 s87, s89 +; SI-NEXT: s_mov_b32 s89, s79 +; SI-NEXT: s_mov_b32 s79, s17 +; SI-NEXT: s_mov_b32 s17, s92 +; SI-NEXT: v_readlane_b32 s25, v33, 41 +; SI-NEXT: v_readlane_b32 s92, v33, 40 +; SI-NEXT: s_mov_b32 s67, s76 +; SI-NEXT: s_mov_b32 s76, s82 +; SI-NEXT: s_mov_b32 s82, s96 +; SI-NEXT: s_mov_b32 s96, s36 +; SI-NEXT: s_mov_b32 s36, s16 ; SI-NEXT: s_cbranch_execnz .LBB111_3 ; SI-NEXT: .LBB111_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v37 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v36, s6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v36, s17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v48 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v50 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v36, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: s_add_i32 s14, s14, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v36, s18 -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v50, s14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s8, s8, 3 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v36, s8 -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v50, s25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v36, s9 -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v50, s15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s10, s10, 3 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v36, s10 -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v50, s26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: s_add_i32 s40, s40, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v36, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s22 -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v50, s40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s4, s78, 3 +; SI-NEXT: v_readlane_b32 s5, v33, 21 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s5, s36, 3 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s6, s96, 16 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_add_i32 s6, s81, 3 +; SI-NEXT: v_readlane_b32 s7, v33, 20 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s7, s79, 3 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s8, s87, 16 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_add_i32 s8, s37, 3 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s9, s92, 16 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_add_i32 s9, s89, 3 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s10, s98, 16 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_add_i32 s10, s39, 3 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s11, s25, 16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_add_i32 s11, s70, 3 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s12, s83, 16 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: v_readlane_b32 s12, v33, 0 ; SI-NEXT: s_add_i32 s12, s12, 3 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v36, s11 -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v48, s12 -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v50, s27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v47 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v44 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v43 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v42 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v41 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v40 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v52 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v39 -; SI-NEXT: s_add_i32 s43, s43, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s42, s42, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s20 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v48, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s13 -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v50, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s43 -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s13, s68, 16 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_add_i32 s13, s90, 3 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s14, s82, 16 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_add_i32 s14, s38, 3 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s15, s29, 16 +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: v_readlane_b32 s15, v33, 1 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s16, s80, 16 +; SI-NEXT: s_or_b32 s15, s16, s15 +; SI-NEXT: v_readlane_b32 s16, v33, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s40, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v33, 2 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s71, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s41, s16, 0x30000 +; SI-NEXT: s_add_i32 s16, s91, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s77, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s42, s16, 0x30000 +; SI-NEXT: s_add_i32 s16, s30, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s49, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s43, s16, 0x30000 +; SI-NEXT: s_add_i32 s16, s94, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s27, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s44, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v33, 13 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s84, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s45, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v33, 18 +; SI-NEXT: s_add_i32 s28, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v33, 5 +; SI-NEXT: s_and_b32 s16, s28, 0xffff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s46, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v33, 4 +; SI-NEXT: s_add_i32 s29, s16, 3 +; SI-NEXT: s_and_b32 s16, s29, 0xffff +; SI-NEXT: s_lshl_b32 s17, s48, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s47, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v33, 17 +; SI-NEXT: s_add_i32 s26, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v33, 7 +; SI-NEXT: s_and_b32 s16, s26, 0xffff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s56, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v33, 6 +; SI-NEXT: s_add_i32 s27, s16, 3 +; SI-NEXT: s_and_b32 s16, s27, 0xffff +; SI-NEXT: s_lshl_b32 s17, s76, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s57, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v33, 16 +; SI-NEXT: s_add_i32 s24, s16, 3 +; SI-NEXT: s_and_b32 s16, s24, 0xffff +; SI-NEXT: s_lshl_b32 s17, s35, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s58, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v33, 8 +; SI-NEXT: s_add_i32 s25, s16, 3 +; SI-NEXT: s_and_b32 s16, s25, 0xffff +; SI-NEXT: s_lshl_b32 s17, s67, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s59, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v33, 15 +; SI-NEXT: s_add_i32 s22, s16, 3 +; SI-NEXT: s_and_b32 s16, s22, 0xffff +; SI-NEXT: s_lshl_b32 s17, s31, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s60, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v33, 9 +; SI-NEXT: s_add_i32 s23, s16, 3 +; SI-NEXT: s_and_b32 s16, s23, 0xffff +; SI-NEXT: s_lshl_b32 s17, s65, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s61, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v33, 19 +; SI-NEXT: s_add_i32 s20, s16, 3 +; SI-NEXT: s_and_b32 s16, s20, 0xffff +; SI-NEXT: s_lshl_b32 s17, s95, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s21, s88, 3 +; SI-NEXT: s_add_i32 s72, s16, 0x30000 +; SI-NEXT: s_and_b32 s16, s21, 0xffff +; SI-NEXT: s_lshl_b32 s17, s55, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s73, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v33, 14 +; SI-NEXT: s_add_i32 s18, s16, 3 +; SI-NEXT: s_and_b32 s16, s18, 0xffff +; SI-NEXT: s_lshl_b32 s17, s93, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s74, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v33, 10 +; SI-NEXT: s_add_i32 s19, s16, 3 +; SI-NEXT: s_and_b32 s16, s19, 0xffff +; SI-NEXT: s_lshl_b32 s17, s53, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s75, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v33, 12 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s99, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s62, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v33, 11 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s51, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s63, s16, 0x30000 +; SI-NEXT: s_lshr_b64 s[16:17], s[46:47], 16 +; SI-NEXT: v_writelane_b32 v33, s16, 22 +; SI-NEXT: v_writelane_b32 v33, s17, 23 +; SI-NEXT: s_lshr_b64 s[16:17], s[44:45], 16 +; SI-NEXT: v_writelane_b32 v33, s16, 24 +; SI-NEXT: v_writelane_b32 v33, s17, 25 +; SI-NEXT: s_lshr_b64 s[16:17], s[42:43], 16 +; SI-NEXT: v_writelane_b32 v33, s16, 26 +; SI-NEXT: v_writelane_b32 v33, s17, 27 +; SI-NEXT: s_lshr_b64 s[16:17], s[40:41], 16 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: v_writelane_b32 v33, s16, 28 +; SI-NEXT: v_writelane_b32 v33, s17, 29 +; SI-NEXT: s_lshr_b64 s[16:17], s[14:15], 16 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: v_writelane_b32 v33, s16, 30 +; SI-NEXT: v_writelane_b32 v33, s17, 31 +; SI-NEXT: s_lshr_b64 s[16:17], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[52:53], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[54:55], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[64:65], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[66:67], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[18:19], s[62:63], 16 +; SI-NEXT: s_lshr_b64 s[20:21], s[74:75], 16 +; SI-NEXT: s_lshr_b64 s[22:23], s[72:73], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[60:61], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[58:59], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[56:57], 16 +; SI-NEXT: v_writelane_b32 v33, s16, 32 +; SI-NEXT: s_lshr_b32 s51, s63, 16 +; SI-NEXT: s_lshr_b32 s53, s75, 16 +; SI-NEXT: s_lshr_b32 s55, s73, 16 +; SI-NEXT: s_lshr_b32 s65, s61, 16 +; SI-NEXT: s_lshr_b32 s67, s59, 16 +; SI-NEXT: s_lshr_b32 s76, s57, 16 +; SI-NEXT: s_lshr_b32 s48, s47, 16 +; SI-NEXT: s_lshr_b32 s84, s45, 16 +; SI-NEXT: s_lshr_b32 s49, s43, 16 +; SI-NEXT: s_lshr_b32 s71, s41, 16 +; SI-NEXT: s_lshr_b32 s80, s15, 16 +; SI-NEXT: s_lshr_b32 s82, s13, 16 +; SI-NEXT: s_lshr_b32 s83, s11, 16 +; SI-NEXT: s_lshr_b32 s98, s9, 16 +; SI-NEXT: s_lshr_b32 s87, s7, 16 +; SI-NEXT: s_lshr_b32 s96, s5, 16 +; SI-NEXT: v_writelane_b32 v33, s17, 33 ; SI-NEXT: .LBB111_3: ; %end -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v34 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v18 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v35, v18 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 -; SI-NEXT: v_or_b32_e32 v17, v20, v17 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v19, v57 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v34 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v35, v19 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v61 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_or_b32_e32 v20, v34, v20 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v21, v35, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v45 -; SI-NEXT: v_or_b32_e32 v23, v35, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v58 -; SI-NEXT: v_or_b32_e32 v25, v35, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v62 -; SI-NEXT: v_or_b32_e32 v27, v35, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v63 -; SI-NEXT: v_or_b32_e32 v29, v35, v29 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_or_b32_e32 v22, v34, v22 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_or_b32_e32 v24, v34, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v46 -; SI-NEXT: v_or_b32_e32 v26, v34, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v59 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v28, v34, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v30 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v33 -; SI-NEXT: v_or_b32_e32 v30, v32, v30 -; SI-NEXT: v_or_b32_e32 v31, v34, v31 +; SI-NEXT: s_and_b32 s16, s62, 0xffff +; SI-NEXT: s_lshl_b32 s17, s18, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s63, 0xffff +; SI-NEXT: s_lshl_b32 s18, s51, 16 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_and_b32 s18, s74, 0xffff +; SI-NEXT: s_lshl_b32 s19, s20, 16 +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: s_and_b32 s19, s75, 0xffff +; SI-NEXT: s_lshl_b32 s20, s53, 16 +; SI-NEXT: s_or_b32 s19, s19, s20 +; SI-NEXT: s_and_b32 s20, s72, 0xffff +; SI-NEXT: s_lshl_b32 s21, s22, 16 +; SI-NEXT: s_or_b32 s20, s20, s21 +; SI-NEXT: s_and_b32 s21, s73, 0xffff +; SI-NEXT: s_lshl_b32 s22, s55, 16 +; SI-NEXT: s_or_b32 s21, s21, s22 +; SI-NEXT: s_and_b32 s22, s60, 0xffff +; SI-NEXT: s_lshl_b32 s23, s24, 16 +; SI-NEXT: s_or_b32 s22, s22, s23 +; SI-NEXT: s_and_b32 s23, s61, 0xffff +; SI-NEXT: s_lshl_b32 s24, s65, 16 +; SI-NEXT: s_or_b32 s23, s23, s24 +; SI-NEXT: s_and_b32 s24, s58, 0xffff +; SI-NEXT: s_lshl_b32 s25, s26, 16 +; SI-NEXT: s_or_b32 s24, s24, s25 +; SI-NEXT: s_and_b32 s25, s59, 0xffff +; SI-NEXT: s_lshl_b32 s26, s67, 16 +; SI-NEXT: s_or_b32 s25, s25, s26 +; SI-NEXT: s_and_b32 s26, s56, 0xffff +; SI-NEXT: s_lshl_b32 s27, s28, 16 +; SI-NEXT: s_or_b32 s26, s26, s27 +; SI-NEXT: s_and_b32 s27, s57, 0xffff +; SI-NEXT: s_lshl_b32 s28, s76, 16 +; SI-NEXT: v_readlane_b32 s56, v33, 22 +; SI-NEXT: s_or_b32 s27, s27, s28 +; SI-NEXT: s_and_b32 s28, s46, 0xffff +; SI-NEXT: s_lshl_b32 s29, s56, 16 +; SI-NEXT: s_or_b32 s28, s28, s29 +; SI-NEXT: s_and_b32 s29, s47, 0xffff +; SI-NEXT: s_lshl_b32 s46, s48, 16 +; SI-NEXT: s_or_b32 s29, s29, s46 +; SI-NEXT: v_readlane_b32 s46, v33, 24 +; SI-NEXT: s_and_b32 s44, s44, 0xffff +; SI-NEXT: s_lshl_b32 s46, s46, 16 +; SI-NEXT: v_readlane_b32 s47, v33, 25 +; SI-NEXT: s_or_b32 s44, s44, s46 +; SI-NEXT: s_and_b32 s45, s45, 0xffff +; SI-NEXT: s_lshl_b32 s46, s84, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: v_readlane_b32 s46, v33, 26 +; SI-NEXT: s_and_b32 s42, s42, 0xffff +; SI-NEXT: s_lshl_b32 s46, s46, 16 +; SI-NEXT: v_readlane_b32 s47, v33, 27 +; SI-NEXT: s_or_b32 s42, s42, s46 +; SI-NEXT: s_and_b32 s43, s43, 0xffff +; SI-NEXT: s_lshl_b32 s46, s49, 16 +; SI-NEXT: s_or_b32 s43, s43, s46 +; SI-NEXT: v_readlane_b32 s46, v33, 28 +; SI-NEXT: s_and_b32 s40, s40, 0xffff +; SI-NEXT: s_lshl_b32 s46, s46, 16 +; SI-NEXT: v_readlane_b32 s47, v33, 29 +; SI-NEXT: s_or_b32 s40, s40, s46 +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: s_lshl_b32 s46, s71, 16 +; SI-NEXT: s_or_b32 s41, s41, s46 +; SI-NEXT: v_readlane_b32 s46, v33, 30 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s46, s46, 16 +; SI-NEXT: v_readlane_b32 s47, v33, 31 +; SI-NEXT: s_or_b32 s14, s14, s46 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s46, s80, 16 +; SI-NEXT: s_or_b32 s15, s15, s46 +; SI-NEXT: v_readlane_b32 s46, v33, 32 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s46, s46, 16 +; SI-NEXT: s_or_b32 s12, s12, s46 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s46, s82, 16 +; SI-NEXT: s_or_b32 s13, s13, s46 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s46, s52, 16 +; SI-NEXT: s_or_b32 s10, s10, s46 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s46, s83, 16 +; SI-NEXT: s_or_b32 s11, s11, s46 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s46, s54, 16 +; SI-NEXT: s_or_b32 s8, s8, s46 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s46, s98, 16 +; SI-NEXT: s_or_b32 s9, s9, s46 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s46, s64, 16 +; SI-NEXT: s_or_b32 s6, s6, s46 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s46, s87, 16 +; SI-NEXT: s_or_b32 s7, s7, s46 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s46, s66, 16 +; SI-NEXT: s_or_b32 s4, s4, s46 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s46, s96, 16 +; SI-NEXT: s_or_b32 s5, s5, s46 +; SI-NEXT: v_readlane_b32 s57, v33, 23 +; SI-NEXT: v_readlane_b32 s47, v33, 33 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: v_mov_b32_e32 v14, s44 +; SI-NEXT: v_mov_b32_e32 v15, s45 +; SI-NEXT: v_mov_b32_e32 v16, s42 +; SI-NEXT: v_mov_b32_e32 v17, s43 +; SI-NEXT: v_mov_b32_e32 v18, s40 +; SI-NEXT: v_mov_b32_e32 v19, s41 +; SI-NEXT: v_mov_b32_e32 v20, s14 +; SI-NEXT: v_mov_b32_e32 v21, s15 +; SI-NEXT: v_mov_b32_e32 v22, s12 +; SI-NEXT: v_mov_b32_e32 v23, s13 +; SI-NEXT: v_mov_b32_e32 v24, s10 +; SI-NEXT: v_mov_b32_e32 v25, s11 +; SI-NEXT: v_mov_b32_e32 v26, s8 +; SI-NEXT: v_mov_b32_e32 v27, s9 +; SI-NEXT: v_mov_b32_e32 v28, s6 +; SI-NEXT: v_mov_b32_e32 v29, s7 +; SI-NEXT: v_mov_b32_e32 v30, s4 +; SI-NEXT: v_mov_b32_e32 v31, s5 +; SI-NEXT: v_readlane_b32 s99, v32, 35 +; SI-NEXT: v_readlane_b32 s98, v32, 34 +; SI-NEXT: v_readlane_b32 s97, v32, 33 +; SI-NEXT: v_readlane_b32 s96, v32, 32 +; SI-NEXT: v_readlane_b32 s87, v32, 31 +; SI-NEXT: v_readlane_b32 s86, v32, 30 +; SI-NEXT: v_readlane_b32 s85, v32, 29 +; SI-NEXT: v_readlane_b32 s84, v32, 28 +; SI-NEXT: v_readlane_b32 s83, v32, 27 +; SI-NEXT: v_readlane_b32 s82, v32, 26 +; SI-NEXT: v_readlane_b32 s81, v32, 25 +; SI-NEXT: v_readlane_b32 s80, v32, 24 +; SI-NEXT: v_readlane_b32 s71, v32, 23 +; SI-NEXT: v_readlane_b32 s70, v32, 22 +; SI-NEXT: v_readlane_b32 s69, v32, 21 +; SI-NEXT: v_readlane_b32 s68, v32, 20 +; SI-NEXT: v_readlane_b32 s67, v32, 19 +; SI-NEXT: v_readlane_b32 s66, v32, 18 +; SI-NEXT: v_readlane_b32 s65, v32, 17 +; SI-NEXT: v_readlane_b32 s64, v32, 16 +; SI-NEXT: v_readlane_b32 s55, v32, 15 +; SI-NEXT: v_readlane_b32 s54, v32, 14 +; SI-NEXT: v_readlane_b32 s53, v32, 13 +; SI-NEXT: v_readlane_b32 s52, v32, 12 +; SI-NEXT: v_readlane_b32 s51, v32, 11 +; SI-NEXT: v_readlane_b32 s50, v32, 10 +; SI-NEXT: v_readlane_b32 s49, v32, 9 +; SI-NEXT: v_readlane_b32 s48, v32, 8 +; SI-NEXT: v_readlane_b32 s39, v32, 7 +; SI-NEXT: v_readlane_b32 s38, v32, 6 +; SI-NEXT: v_readlane_b32 s37, v32, 5 +; SI-NEXT: v_readlane_b32 s36, v32, 4 +; SI-NEXT: v_readlane_b32 s35, v32, 3 +; SI-NEXT: v_readlane_b32 s34, v32, 2 +; SI-NEXT: v_readlane_b32 s31, v32, 1 +; SI-NEXT: v_readlane_b32 s30, v32, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB111_4: -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: v_mov_b32_e32 v37, v21 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: v_mov_b32_e32 v44, v63 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: v_mov_b32_e32 v43, v59 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: v_mov_b32_e32 v42, v38 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: v_mov_b32_e32 v41, v36 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: v_mov_b32_e32 v40, v32 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: v_mov_b32_e32 v55, v31 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: v_mov_b32_e32 v54, v29 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: v_mov_b32_e32 v53, v28 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: v_mov_b32_e32 v52, v27 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: v_mov_b32_e32 v51, v26 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: v_mov_b32_e32 v50, v25 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: v_mov_b32_e32 v49, v24 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: v_mov_b32_e32 v48, v23 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: v_mov_b32_e32 v39, v22 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: v_mov_b32_e32 v38, v30 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: v_mov_b32_e32 v36, v33 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; kill: killed $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v33, s4, 22 +; SI-NEXT: v_writelane_b32 v33, s5, 23 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: v_writelane_b32 v33, s4, 24 +; SI-NEXT: v_writelane_b32 v33, s5, 25 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v33, s4, 26 +; SI-NEXT: v_writelane_b32 v33, s5, 27 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v33, s4, 28 +; SI-NEXT: v_writelane_b32 v33, s5, 29 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v33, s4, 30 +; SI-NEXT: v_writelane_b32 v33, s5, 31 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v33, s4, 32 +; SI-NEXT: v_writelane_b32 v33, s5, 33 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: s_branch .LBB111_2 ; ; VI-LABEL: bitcast_v64i16_to_v64f16_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll index 7351cff50f25f..67fb9a9e56a4e 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll @@ -1634,72 +1634,42 @@ define <8 x half> @bitcast_v4i32_to_v8f16(<4 x i32> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: .LBB16_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v4, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v6, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_alignbit_b32 v4, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v6, v1, v0, 16 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: .LBB16_4: ; %end +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v8 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v6 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i32_to_v8f16: @@ -1774,63 +1744,43 @@ define inreg <8 x half> @bitcast_v4i32_to_v8f16_scalar(<4 x i32> inreg %a, i32 i ; SI-NEXT: s_cmp_lg_u32 s20, 0 ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s16 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s11, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: s_lshr_b32 s5, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s11, s17, 16 ; SI-NEXT: .LBB17_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v0, v6, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v8, v1 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s11, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s18, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s7, s4 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s10, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v4i32_to_v8f16_scalar: @@ -1917,26 +1867,14 @@ define <4 x i32> @bitcast_v8f16_to_v4i32(<8 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v8f16_to_v4i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v8, v3 +; SI-NEXT: v_mov_b32_e32 v5, v2 +; SI-NEXT: v_mov_b32_e32 v6, v1 +; SI-NEXT: v_mov_b32_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -1949,29 +1887,33 @@ define <4 x i32> @bitcast_v8f16_to_v4i32(<8 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB18_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; SI-NEXT: v_or_b32_e32 v0, v11, v0 -; SI-NEXT: v_or_b32_e32 v1, v9, v1 -; SI-NEXT: v_or_b32_e32 v2, v7, v2 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v11 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: .LBB18_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -1980,15 +1922,15 @@ define <4 x i32> @bitcast_v8f16_to_v4i32(<8 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v10 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -2083,43 +2025,31 @@ define inreg <4 x i32> @bitcast_v8f16_to_v4i32_scalar(<8 x half> inreg %a, i32 i ; SI-LABEL: bitcast_v8f16_to_v4i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s11, s18, 16 +; SI-NEXT: s_lshr_b32 s12, s17, 16 +; SI-NEXT: s_lshr_b32 s13, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: s_cbranch_scc0 .LBB19_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v0, v10, v0 -; SI-NEXT: v_or_b32_e32 v1, v8, v1 -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s12, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s11, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s10, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_cbranch_execnz .LBB19_4 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -2127,32 +2057,37 @@ define inreg <4 x i32> @bitcast_v8f16_to_v4i32_scalar(<8 x half> inreg %a, i32 i ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s11 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB19_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: .LBB19_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 ; SI-NEXT: s_branch .LBB19_2 +; SI-NEXT: .LBB19_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f16_to_v4i32_scalar: ; VI: ; %bb.0: @@ -6279,72 +6214,42 @@ define <8 x half> @bitcast_v4f32_to_v8f16(<4 x float> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB40_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: .LBB40_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v4, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v6, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB40_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v4, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v6, v1, v0, 16 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: .LBB40_4: ; %end +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v8 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v6 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f32_to_v8f16: @@ -6415,66 +6320,52 @@ define inreg <8 x half> @bitcast_v4f32_to_v8f16_scalar(<4 x float> inreg %a, i32 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: s_cbranch_scc0 .LBB41_4 +; SI-NEXT: s_cbranch_scc0 .LBB41_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s16 -; SI-NEXT: s_cbranch_execnz .LBB41_3 +; SI-NEXT: s_lshr_b32 s11, s19, 16 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[16:17], 16 +; SI-NEXT: s_cbranch_execnz .LBB41_4 ; SI-NEXT: .LBB41_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v0, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v3, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v2, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_lshr_b64 v[4:5], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: .LBB41_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v0, v6, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v8, v1 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB41_4: -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_branch .LBB41_5 +; SI-NEXT: .LBB41_3: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: s_branch .LBB41_2 +; SI-NEXT: .LBB41_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v7, s10 +; SI-NEXT: v_mov_b32_e32 v6, s11 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_mov_b32_e32 v5, s6 +; SI-NEXT: .LBB41_5: ; %end +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f32_to_v8f16_scalar: ; VI: ; %bb.0: @@ -6562,26 +6453,14 @@ define <4 x float> @bitcast_v8f16_to_v4f32(<8 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v8f16_to_v4f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v8, v3 +; SI-NEXT: v_mov_b32_e32 v5, v2 +; SI-NEXT: v_mov_b32_e32 v6, v1 +; SI-NEXT: v_mov_b32_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -6594,29 +6473,33 @@ define <4 x float> @bitcast_v8f16_to_v4f32(<8 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB42_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; SI-NEXT: v_or_b32_e32 v0, v11, v0 -; SI-NEXT: v_or_b32_e32 v1, v9, v1 -; SI-NEXT: v_or_b32_e32 v2, v7, v2 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v11 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_2 ; SI-NEXT: .LBB42_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -6625,15 +6508,15 @@ define <4 x float> @bitcast_v8f16_to_v4f32(<8 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v10 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -6728,43 +6611,31 @@ define inreg <4 x float> @bitcast_v8f16_to_v4f32_scalar(<8 x half> inreg %a, i32 ; SI-LABEL: bitcast_v8f16_to_v4f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s11, s18, 16 +; SI-NEXT: s_lshr_b32 s12, s17, 16 +; SI-NEXT: s_lshr_b32 s13, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: s_cbranch_scc0 .LBB43_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v0, v10, v0 -; SI-NEXT: v_or_b32_e32 v1, v8, v1 -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s12, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s11, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s10, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_cbranch_execnz .LBB43_4 ; SI-NEXT: .LBB43_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -6772,32 +6643,37 @@ define inreg <4 x float> @bitcast_v8f16_to_v4f32_scalar(<8 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s11 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB43_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: .LBB43_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 ; SI-NEXT: s_branch .LBB43_2 +; SI-NEXT: .LBB43_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f16_to_v4f32_scalar: ; VI: ; %bb.0: @@ -10583,72 +10459,42 @@ define <8 x half> @bitcast_v2i64_to_v8f16(<2 x i64> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB60_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: .LBB60_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v4, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v6, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB60_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_alignbit_b32 v4, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v6, v1, v0, 16 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: .LBB60_4: ; %end +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v8 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v6 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i64_to_v8f16: @@ -10724,63 +10570,43 @@ define inreg <8 x half> @bitcast_v2i64_to_v8f16_scalar(<2 x i64> inreg %a, i32 i ; SI-NEXT: s_cmp_lg_u32 s20, 0 ; SI-NEXT: s_cbranch_scc0 .LBB61_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s16 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s11, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB61_3 ; SI-NEXT: .LBB61_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s16, 3 -; SI-NEXT: s_addc_u32 s5, s17, 0 -; SI-NEXT: s_lshr_b32 s6, s4, 16 -; SI-NEXT: s_lshr_b32 s7, s5, 16 -; SI-NEXT: s_add_u32 s8, s18, 3 -; SI-NEXT: s_addc_u32 s9, s19, 0 -; SI-NEXT: s_lshr_b32 s10, s8, 16 -; SI-NEXT: s_lshr_b32 s11, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s11, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[16:17], 16 ; SI-NEXT: .LBB61_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v0, v6, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v8, v1 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s11, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s18, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s7, s4 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s10, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB61_4: -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: s_branch .LBB61_2 ; ; VI-LABEL: bitcast_v2i64_to_v8f16_scalar: @@ -10867,26 +10693,14 @@ define <2 x i64> @bitcast_v8f16_to_v2i64(<8 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v8f16_to_v2i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v8, v3 +; SI-NEXT: v_mov_b32_e32 v5, v2 +; SI-NEXT: v_mov_b32_e32 v6, v1 +; SI-NEXT: v_mov_b32_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -10899,29 +10713,33 @@ define <2 x i64> @bitcast_v8f16_to_v2i64(<8 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB62_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; SI-NEXT: v_or_b32_e32 v0, v11, v0 -; SI-NEXT: v_or_b32_e32 v1, v9, v1 -; SI-NEXT: v_or_b32_e32 v2, v7, v2 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v11 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB62_2 ; SI-NEXT: .LBB62_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -10930,15 +10748,15 @@ define <2 x i64> @bitcast_v8f16_to_v2i64(<8 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v10 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -11033,43 +10851,31 @@ define inreg <2 x i64> @bitcast_v8f16_to_v2i64_scalar(<8 x half> inreg %a, i32 i ; SI-LABEL: bitcast_v8f16_to_v2i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s11, s18, 16 +; SI-NEXT: s_lshr_b32 s12, s17, 16 +; SI-NEXT: s_lshr_b32 s13, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: s_cbranch_scc0 .LBB63_4 +; SI-NEXT: s_cbranch_scc0 .LBB63_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v0, v10, v0 -; SI-NEXT: v_or_b32_e32 v1, v8, v1 -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: s_cbranch_execnz .LBB63_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s12, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s11, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s10, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_cbranch_execnz .LBB63_4 ; SI-NEXT: .LBB63_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -11077,32 +10883,37 @@ define inreg <2 x i64> @bitcast_v8f16_to_v2i64_scalar(<8 x half> inreg %a, i32 i ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s11 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: .LBB63_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB63_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: .LBB63_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 ; SI-NEXT: s_branch .LBB63_2 +; SI-NEXT: .LBB63_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f16_to_v2i64_scalar: ; VI: ; %bb.0: @@ -14490,68 +14301,40 @@ define <8 x half> @bitcast_v2f64_to_v8f16(<2 x double> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB76_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: .LBB76_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v4, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v6, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB76_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_alignbit_b32 v4, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v6, v1, v0, 16 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: .LBB76_4: ; %end +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v8 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v6 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f64_to_v8f16: @@ -14621,64 +14404,50 @@ define inreg <8 x half> @bitcast_v2f64_to_v8f16_scalar(<2 x double> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: s_cbranch_scc0 .LBB77_4 +; SI-NEXT: s_cbranch_scc0 .LBB77_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s16 -; SI-NEXT: s_cbranch_execnz .LBB77_3 +; SI-NEXT: s_lshr_b32 s11, s19, 16 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[16:17], 16 +; SI-NEXT: s_cbranch_execnz .LBB77_4 ; SI-NEXT: .LBB77_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[5:6], s[16:17], 1.0 -; SI-NEXT: v_add_f64 v[0:1], s[18:19], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: .LBB77_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v0, v6, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v8, v1 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB77_4: -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_lshr_b64 v[4:5], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: s_branch .LBB77_5 +; SI-NEXT: .LBB77_3: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: s_branch .LBB77_2 +; SI-NEXT: .LBB77_4: +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v5, s6 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: .LBB77_5: ; %end +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f64_to_v8f16_scalar: ; VI: ; %bb.0: @@ -14760,26 +14529,14 @@ define <2 x double> @bitcast_v8f16_to_v2f64(<8 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v8f16_to_v2f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v8, v3 +; SI-NEXT: v_mov_b32_e32 v5, v2 +; SI-NEXT: v_mov_b32_e32 v6, v1 +; SI-NEXT: v_mov_b32_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -14792,29 +14549,33 @@ define <2 x double> @bitcast_v8f16_to_v2f64(<8 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB78_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; SI-NEXT: v_or_b32_e32 v0, v11, v0 -; SI-NEXT: v_or_b32_e32 v1, v9, v1 -; SI-NEXT: v_or_b32_e32 v2, v7, v2 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v11 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB78_2 ; SI-NEXT: .LBB78_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -14823,15 +14584,15 @@ define <2 x double> @bitcast_v8f16_to_v2f64(<8 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v10 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -14926,43 +14687,31 @@ define inreg <2 x double> @bitcast_v8f16_to_v2f64_scalar(<8 x half> inreg %a, i3 ; SI-LABEL: bitcast_v8f16_to_v2f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s11, s18, 16 +; SI-NEXT: s_lshr_b32 s12, s17, 16 +; SI-NEXT: s_lshr_b32 s13, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: s_cbranch_scc0 .LBB79_4 +; SI-NEXT: s_cbranch_scc0 .LBB79_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v0, v10, v0 -; SI-NEXT: v_or_b32_e32 v1, v8, v1 -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: s_cbranch_execnz .LBB79_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s12, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s11, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s10, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_cbranch_execnz .LBB79_4 ; SI-NEXT: .LBB79_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -14970,32 +14719,37 @@ define inreg <2 x double> @bitcast_v8f16_to_v2f64_scalar(<8 x half> inreg %a, i3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s11 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: .LBB79_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB79_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: .LBB79_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 ; SI-NEXT: s_branch .LBB79_2 +; SI-NEXT: .LBB79_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f16_to_v2f64_scalar: ; VI: ; %bb.0: @@ -17868,77 +17622,82 @@ define <8 x half> @bitcast_v8i16_to_v8f16(<8 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v8i16_to_v8f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB88_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v8, v1, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v4, v1, v15 +; SI-NEXT: v_or_b32_e32 v9, v0, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_alignbit_b32 v10, v8, v12, 16 +; SI-NEXT: v_alignbit_b32 v11, v4, v14, 16 +; SI-NEXT: v_or_b32_e32 v7, v0, v14 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: .LBB88_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB88_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v13 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v15 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v2, v14, v2 +; SI-NEXT: v_or_b32_e32 v0, v12, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v2, v15, v2 +; SI-NEXT: v_or_b32_e32 v0, v13, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_alignbit_b32 v10, v8, v9, 16 +; SI-NEXT: v_alignbit_b32 v11, v4, v7, 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; SI-NEXT: .LBB88_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i16_to_v8f16: @@ -18020,66 +17779,78 @@ define inreg <8 x half> @bitcast_v8i16_to_v8f16_scalar(<8 x i16> inreg %a, i32 i ; SI-LABEL: bitcast_v8i16_to_v8f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_lshr_b32 s15, s19, 16 +; SI-NEXT: s_lshr_b32 s22, s18, 16 +; SI-NEXT: s_lshr_b32 s14, s17, 16 +; SI-NEXT: s_lshr_b32 s21, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s20, 0 ; SI-NEXT: s_cbranch_scc0 .LBB89_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s9 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s14, 16 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s24, s21, 16 +; SI-NEXT: s_or_b32 s25, s5, s7 +; SI-NEXT: s_and_b32 s5, s19, 0xffff +; SI-NEXT: s_lshl_b32 s7, s15, 16 +; SI-NEXT: s_or_b32 s6, s4, s24 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s26, s22, 16 +; SI-NEXT: s_or_b32 s27, s5, s7 +; SI-NEXT: s_or_b32 s4, s4, s26 +; SI-NEXT: s_lshr_b64 s[8:9], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[26:27], 16 +; SI-NEXT: s_mov_b32 s7, s25 +; SI-NEXT: s_mov_b32 s5, s27 ; SI-NEXT: s_cbranch_execnz .LBB89_3 ; SI-NEXT: .LBB89_2: ; %cmp.true -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s22, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s19, 0xffff +; SI-NEXT: s_lshl_b32 s6, s15, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s9 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s8, s14, 16 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_lshr_b64 s[8:9], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 16 +; SI-NEXT: s_lshr_b32 s14, s7, 16 +; SI-NEXT: s_lshr_b32 s15, s5, 16 ; SI-NEXT: .LBB89_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v0, v0, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; SI-NEXT: v_or_b32_e32 v1, v1, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s8, s14, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s8, s10, 16 +; SI-NEXT: s_or_b32 s4, s4, s8 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s8, s15, 16 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_mov_b32_e32 v3, s5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB89_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: s_branch .LBB89_2 ; ; VI-LABEL: bitcast_v8i16_to_v8f16_scalar: @@ -18184,79 +17955,63 @@ define <8 x i16> @bitcast_v8f16_to_v8i16(<8 x half> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB90_2 ; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v3 -; SI-NEXT: v_or_b32_e32 v6, v6, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v6 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v2, v2, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v8 ; SI-NEXT: v_or_b32_e32 v0, v0, v4 -; SI-NEXT: v_or_b32_e32 v5, v5, v7 -; SI-NEXT: v_alignbit_b32 v8, v2, v4, 16 -; SI-NEXT: v_alignbit_b32 v7, v6, v7, 16 +; SI-NEXT: v_or_b32_e32 v2, v2, v7 +; SI-NEXT: v_alignbit_b32 v8, v1, v4, 16 +; SI-NEXT: v_alignbit_b32 v7, v3, v7, 16 ; SI-NEXT: .LBB90_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v4 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f16_to_v8i16: @@ -18339,81 +18094,75 @@ define inreg <8 x i16> @bitcast_v8f16_to_v8i16_scalar(<8 x half> inreg %a, i32 i ; SI-LABEL: bitcast_v8f16_to_v8i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s18, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s9, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: s_cbranch_scc0 .LBB91_4 +; SI-NEXT: s_cbranch_scc0 .LBB91_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB91_3 +; SI-NEXT: s_cbranch_execnz .LBB91_4 ; SI-NEXT: .LBB91_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v9, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s17 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v10, v5, v0 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v5 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v4 ; SI-NEXT: v_lshr_b64 v[6:7], v[0:1], 16 ; SI-NEXT: v_lshr_b64 v[4:5], v[2:3], 16 ; SI-NEXT: v_or_b32_e32 v2, v11, v2 -; SI-NEXT: .LBB91_3: ; %end -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: s_branch .LBB91_5 +; SI-NEXT: .LBB91_3: +; SI-NEXT: s_branch .LBB91_2 +; SI-NEXT: .LBB91_4: +; SI-NEXT: v_mov_b32_e32 v8, s7 +; SI-NEXT: v_mov_b32_e32 v10, s6 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v9, s16 +; SI-NEXT: v_mov_b32_e32 v6, s9 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: .LBB91_5: ; %end +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v0, v0, v5 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 ; SI-NEXT: v_or_b32_e32 v1, v1, v5 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB91_4: -; SI-NEXT: s_branch .LBB91_2 ; ; VI-LABEL: bitcast_v8f16_to_v8i16_scalar: ; VI: ; %bb.0: @@ -21514,107 +21263,91 @@ define <8 x bfloat> @bitcast_v8f16_to_v8bf16(<8 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v8f16_to_v8bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v5 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v15 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB100_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: .LBB100_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v7 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v5 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v6 ; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v11 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v8 ; SI-NEXT: v_alignbit_b32 v3, v3, v4, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -21698,99 +21431,93 @@ define inreg <8 x bfloat> @bitcast_v8f16_to_v8bf16_scalar(<8 x half> inreg %a, i ; SI-LABEL: bitcast_v8f16_to_v8bf16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v0 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s18, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: s_cbranch_scc0 .LBB101_4 +; SI-NEXT: s_cbranch_scc0 .LBB101_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v15 -; SI-NEXT: s_cbranch_execnz .LBB101_3 +; SI-NEXT: s_lshl_b32 s10, s16, 16 +; SI-NEXT: s_lshl_b32 s11, s6, 16 +; SI-NEXT: s_lshl_b32 s12, s17, 16 +; SI-NEXT: s_lshl_b32 s13, s7, 16 +; SI-NEXT: s_lshl_b32 s14, s18, 16 +; SI-NEXT: s_lshl_b32 s15, s8, 16 +; SI-NEXT: s_lshl_b32 s20, s19, 16 +; SI-NEXT: s_lshl_b32 s21, s9, 16 +; SI-NEXT: s_cbranch_execnz .LBB101_4 ; SI-NEXT: .LBB101_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v12 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s16 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s18 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v0 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: .LBB101_3: ; %end -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_branch .LBB101_5 +; SI-NEXT: .LBB101_3: +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr21 +; SI-NEXT: s_branch .LBB101_2 +; SI-NEXT: .LBB101_4: +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v6, s15 +; SI-NEXT: v_mov_b32_e32 v3, s14 +; SI-NEXT: v_mov_b32_e32 v7, s13 +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: v_mov_b32_e32 v1, s11 +; SI-NEXT: v_mov_b32_e32 v0, s10 +; SI-NEXT: .LBB101_5: ; %end +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v2 +; SI-NEXT: v_lshr_b64 v[1:2], v[7:8], 16 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6 -; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; SI-NEXT: v_lshr_b64 v[2:3], v[6:7], 16 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_lshr_b64 v[3:4], v[4:5], 16 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB101_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: s_branch .LBB101_2 ; ; VI-LABEL: bitcast_v8f16_to_v8bf16_scalar: ; VI: ; %bb.0: @@ -21902,106 +21629,94 @@ define <8 x half> @bitcast_v8bf16_to_v8f16(<8 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v2 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB102_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_alignbit_b32 v5, v1, v9, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_alignbit_b32 v6, v5, v0, 16 +; SI-NEXT: v_alignbit_b32 v4, v3, v12, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 +; SI-NEXT: v_alignbit_b32 v7, v4, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_alignbit_b32 v0, v0, v15, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v13, 16 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: .LBB102_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB102_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v14 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v12 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v11 +; SI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v4, v3, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v5, v1, v5, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_alignbit_b32 v6, v5, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, v4, v7, 16 ; SI-NEXT: .LBB102_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v0, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v0, v0, v6 +; SI-NEXT: v_or_b32_e32 v2, v2, v5 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -22364,93 +22079,79 @@ define inreg <8 x half> @bitcast_v8bf16_to_v8f16_scalar(<8 x bfloat> inreg %a, i ; SI-NEXT: s_and_b32 s10, s16, 0xffff0000 ; SI-NEXT: s_lshl_b32 s11, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s11 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s10 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s9 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s8 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s7 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s6 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s5 -; SI-NEXT: v_mul_f32_e64 v15, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s10 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s9 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s6 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s5 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s11 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s7 ; SI-NEXT: s_cbranch_scc0 .LBB103_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 +; SI-NEXT: v_lshr_b64 v[5:6], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v21 +; SI-NEXT: v_lshr_b64 v[14:15], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[3:4], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 +; SI-NEXT: v_lshr_b64 v[18:19], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[12:13], v[11:12], 16 ; SI-NEXT: s_cbranch_execnz .LBB103_3 ; SI-NEXT: .LBB103_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v12 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_lshr_b64 v[9:10], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_lshr_b64 v[12:13], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v20 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_lshr_b64 v[3:4], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; SI-NEXT: v_lshr_b64 v[5:6], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v14 +; SI-NEXT: v_lshr_b64 v[14:15], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[18:19], v[2:3], 16 ; SI-NEXT: .LBB103_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v5 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v11 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB103_4: -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_branch .LBB103_2 ; ; VI-LABEL: bitcast_v8bf16_to_v8f16_scalar: @@ -22867,26 +22568,14 @@ define <16 x i8> @bitcast_v8f16_to_v16i8(<8 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v8f16_to_v16i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v3 +; SI-NEXT: v_mov_b32_e32 v18, v3 +; SI-NEXT: v_mov_b32_e32 v19, v2 +; SI-NEXT: v_mov_b32_e32 v16, v1 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v17 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -22912,14 +22601,18 @@ define <16 x i8> @bitcast_v8f16_to_v16i8(<8 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB104_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_or_b32_e32 v8, v20, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 -; SI-NEXT: v_or_b32_e32 v0, v17, v0 -; SI-NEXT: v_or_b32_e32 v4, v16, v1 -; SI-NEXT: v_or_b32_e32 v12, v19, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v20 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_or_b32_e32 v8, v5, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 +; SI-NEXT: v_or_b32_e32 v4, v1, v2 +; SI-NEXT: v_or_b32_e32 v12, v5, v7 ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 ; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 @@ -22931,18 +22624,18 @@ define <16 x i8> @bitcast_v8f16_to_v16i8(<8 x half> %a, i32 %b) { ; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 ; SI-NEXT: v_bfe_u32 v15, v14, 8, 8 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB104_2 ; SI-NEXT: .LBB104_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v18 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -22953,7 +22646,7 @@ define <16 x i8> @bitcast_v8f16_to_v16i8(<8 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v8, v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v14 ; SI-NEXT: v_or_b32_e32 v12, v2, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 @@ -23259,53 +22952,41 @@ define inreg <16 x i8> @bitcast_v8f16_to_v16i8_scalar(<8 x half> inreg %a, i32 i ; SI-LABEL: bitcast_v8f16_to_v16i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v4 +; SI-NEXT: s_lshr_b32 s28, s19, 16 +; SI-NEXT: s_lshr_b32 s29, s18, 16 +; SI-NEXT: s_lshr_b32 s26, s17, 16 +; SI-NEXT: s_lshr_b32 s27, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: s_cbranch_scc0 .LBB105_4 +; SI-NEXT: s_cbranch_scc0 .LBB105_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_or_b32_e32 v19, v16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; SI-NEXT: v_or_b32_e32 v20, v8, v0 -; SI-NEXT: v_lshr_b64 v[0:1], v[19:20], 16 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 -; SI-NEXT: v_or_b32_e32 v17, v25, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; SI-NEXT: v_or_b32_e32 v18, v24, v1 -; SI-NEXT: v_lshr_b64 v[3:4], v[19:20], 24 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v20 -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v18 -; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 -; SI-NEXT: v_bfe_u32 v15, v14, 8, 8 -; SI-NEXT: v_lshr_b64 v[1:2], v[19:20], 8 -; SI-NEXT: v_lshr_b64 v[11:12], v[17:18], 24 -; SI-NEXT: v_lshr_b64 v[21:22], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[17:18], 8 -; SI-NEXT: s_cbranch_execnz .LBB105_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s27, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s26, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 +; SI-NEXT: s_and_b32 s7, s18, 0xffff +; SI-NEXT: s_lshl_b32 s9, s29, 16 +; SI-NEXT: s_or_b32 s12, s7, s9 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s9, s28, 16 +; SI-NEXT: s_or_b32 s13, s7, s9 +; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 8 +; SI-NEXT: s_lshr_b64 s[14:15], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[20:21], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[22:23], s[12:13], 8 +; SI-NEXT: s_lshr_b32 s7, s5, 8 +; SI-NEXT: s_lshr_b32 s9, s13, 8 +; SI-NEXT: s_bfe_u32 s11, s26, 0x80008 +; SI-NEXT: s_bfe_u32 s15, s28, 0x80008 +; SI-NEXT: s_cbranch_execnz .LBB105_4 ; SI-NEXT: .LBB105_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -23316,12 +22997,12 @@ define inreg <16 x i8> @bitcast_v8f16_to_v16i8_scalar(<8 x half> inreg %a, i32 i ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v17, v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s27 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v14 ; SI-NEXT: v_or_b32_e32 v18, v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -23335,37 +23016,55 @@ define inreg <16 x i8> @bitcast_v8f16_to_v16i8_scalar(<8 x half> inreg %a, i32 i ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; SI-NEXT: v_or_b32_e32 v20, v2, v0 ; SI-NEXT: v_lshr_b64 v[0:1], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[11:12], v[17:18], 24 ; SI-NEXT: v_lshr_b64 v[3:4], v[19:20], 24 ; SI-NEXT: v_lshr_b64 v[1:2], v[19:20], 8 -; SI-NEXT: v_lshr_b64 v[11:12], v[17:18], 24 -; SI-NEXT: v_lshr_b64 v[21:22], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[12:13], v[17:18], 16 ; SI-NEXT: v_lshr_b64 v[9:10], v[17:18], 8 ; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v20 ; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v18 ; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 ; SI-NEXT: v_bfe_u32 v15, v14, 8, 8 -; SI-NEXT: .LBB105_3: ; %end +; SI-NEXT: s_branch .LBB105_5 +; SI-NEXT: .LBB105_3: +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: s_branch .LBB105_2 +; SI-NEXT: .LBB105_4: +; SI-NEXT: v_mov_b32_e32 v14, s28 +; SI-NEXT: v_mov_b32_e32 v6, s26 +; SI-NEXT: v_mov_b32_e32 v15, s15 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v20, s5 +; SI-NEXT: v_mov_b32_e32 v19, s4 +; SI-NEXT: v_mov_b32_e32 v18, s13 +; SI-NEXT: v_mov_b32_e32 v17, s12 +; SI-NEXT: v_mov_b32_e32 v5, s7 +; SI-NEXT: v_mov_b32_e32 v13, s9 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: v_mov_b32_e32 v11, s14 +; SI-NEXT: v_mov_b32_e32 v12, s20 +; SI-NEXT: v_mov_b32_e32 v9, s22 +; SI-NEXT: .LBB105_5: ; %end ; SI-NEXT: v_mov_b32_e32 v2, v0 ; SI-NEXT: v_mov_b32_e32 v0, v19 ; SI-NEXT: v_mov_b32_e32 v4, v20 ; SI-NEXT: v_mov_b32_e32 v8, v17 -; SI-NEXT: v_mov_b32_e32 v10, v21 +; SI-NEXT: v_mov_b32_e32 v10, v12 ; SI-NEXT: v_mov_b32_e32 v12, v18 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB105_4: -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: s_branch .LBB105_2 ; ; VI-LABEL: bitcast_v8f16_to_v16i8_scalar: ; VI: ; %bb.0: @@ -23623,50 +23322,58 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v3 -; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v11 -; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v13 -; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v15 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v9 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB106_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v0, v0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v0, v0, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v16, v3 +; SI-NEXT: v_or_b32_e32 v2, v18, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v17 +; SI-NEXT: v_or_b32_e32 v3, v3, v21 +; SI-NEXT: v_or_b32_e32 v6, v20, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v9, v0, v2 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v8 -; SI-NEXT: v_or_b32_e32 v0, v0, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 -; SI-NEXT: v_or_b32_e32 v0, v0, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v0, v0, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v0, v0, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: v_or_b32_e32 v5, v22, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_alignbit_b32 v7, v1, v2, 16 +; SI-NEXT: v_alignbit_b32 v11, v3, v5, 16 +; SI-NEXT: v_or_b32_e32 v5, v0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 @@ -23675,77 +23382,82 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: .LBB106_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB106_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v10 ; SI-NEXT: v_or_b32_e32 v1, v23, v1 -; SI-NEXT: v_add_i32_e32 v14, vcc, 0x300, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v22, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v14 ; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_or_b32_e32 v1, v22, v1 -; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_or_b32_e32 v1, v21, v1 -; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v20, v1 -; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v19, v1 -; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v18, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v20, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v0, v13, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v18, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v1, v17, v1 -; SI-NEXT: v_or_b32_e32 v0, v16, v0 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v14 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v16, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v0 +; SI-NEXT: v_alignbit_b32 v7, v1, v9, 16 +; SI-NEXT: v_alignbit_b32 v11, v3, v5, 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 ; SI-NEXT: .LBB106_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v15 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v11 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v19 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i8_to_v8f16: @@ -24165,120 +23877,136 @@ define inreg <8 x half> @bitcast_v16i8_to_v8f16_scalar(<16 x i8> inreg %a, i32 i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_readfirstlane_b32 s6, v1 +; SI-NEXT: v_readfirstlane_b32 s14, v1 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s7, v0 +; SI-NEXT: v_readfirstlane_b32 s15, v0 ; SI-NEXT: s_cbranch_scc0 .LBB107_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_and_b32 s4, s18, 0xff -; SI-NEXT: s_lshl_b32 s5, s19, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: s_lshl_b32 s5, s21, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_and_b32 s4, s22, 0xff -; SI-NEXT: s_lshl_b32 s5, s23, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s40, s6, s5 +; SI-NEXT: s_or_b32 s6, s4, s40 ; SI-NEXT: s_and_b32 s4, s24, 0xff ; SI-NEXT: s_lshl_b32 s5, s25, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_and_b32 s4, s26, 0xff -; SI-NEXT: s_lshl_b32 s5, s27, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_and_b32 s4, s28, 0xff -; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_and_b32 s4, s7, 0xff -; SI-NEXT: s_lshl_b32 s5, s6, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_and_b32 s5, s26, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_or_b32 s42, s7, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s7, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s41, s5, s7 +; SI-NEXT: s_lshr_b64 s[8:9], s[40:41], 16 +; SI-NEXT: s_and_b32 s5, s28, 0xff +; SI-NEXT: s_lshl_b32 s9, s29, 8 +; SI-NEXT: s_or_b32 s5, s5, s9 +; SI-NEXT: s_and_b32 s9, s15, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s10, s14, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_or_b32 s43, s5, s9 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshr_b64 s[10:11], s[42:43], 16 +; SI-NEXT: s_or_b32 s4, s4, s42 +; SI-NEXT: s_lshr_b32 s11, s7, 16 +; SI-NEXT: s_lshr_b32 s9, s9, 16 +; SI-NEXT: s_mov_b32 s7, s41 +; SI-NEXT: s_mov_b32 s5, s43 ; SI-NEXT: s_cbranch_execnz .LBB107_3 ; SI-NEXT: .LBB107_2: ; %cmp.true -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_and_b32 s4, s7, 0xff -; SI-NEXT: s_lshl_b32 s5, s6, 8 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s26, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s27, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s28, 0xff ; SI-NEXT: s_lshl_b32 s6, s29, 8 -; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s26, 0xff -; SI-NEXT: s_lshl_b32 s7, s27, 8 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_and_b32 s7, s15, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s14, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s16, 0xff +; SI-NEXT: s_lshl_b32 s7, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s18, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s19, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s24, 0xff -; SI-NEXT: s_lshl_b32 s8, s25, 8 +; SI-NEXT: s_and_b32 s7, s20, 0xff +; SI-NEXT: s_lshl_b32 s8, s21, 8 ; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s22, 0xff -; SI-NEXT: s_lshl_b32 s9, s23, 8 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s20, 0xff -; SI-NEXT: s_lshl_b32 s10, s21, 8 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s18, 0xff -; SI-NEXT: s_lshl_b32 s11, s19, 8 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: s_and_b32 s11, s16, 0xff -; SI-NEXT: s_lshl_b32 s12, s17, 8 -; SI-NEXT: s_or_b32 s11, s12, s11 -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_addk_i32 s5, 0x300 -; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_and_b32 s9, s22, 0xff ; SI-NEXT: s_addk_i32 s7, 0x300 -; SI-NEXT: s_addk_i32 s8, 0x300 -; SI-NEXT: s_addk_i32 s9, 0x300 -; SI-NEXT: s_addk_i32 s10, 0x300 -; SI-NEXT: s_addk_i32 s11, 0x300 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshl_b32 s8, s23, 24 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_add_i32 s7, s7, 0x3000000 +; SI-NEXT: s_lshr_b64 s[8:9], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 16 +; SI-NEXT: s_lshr_b32 s11, s7, 16 +; SI-NEXT: s_lshr_b32 s9, s5, 16 ; SI-NEXT: .LBB107_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s8, s11, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s8, s10, 16 +; SI-NEXT: s_or_b32 s4, s4, s8 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s8, s9, 16 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_mov_b32_e32 v3, s5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB107_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 ; SI-NEXT: s_branch .LBB107_2 ; ; VI-LABEL: bitcast_v16i8_to_v8f16_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll index 8fbab2d6ab753..430a93d9e9bf0 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll @@ -995,86 +995,50 @@ define <10 x half> @bitcast_v5i32_to_v10f16(<5 x i32> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB8_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: .LBB8_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v5, v0, v4, 16 +; SI-NEXT: v_alignbit_b32 v6, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v7, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB8_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v0 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_alignbit_b32 v6, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v7, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v5, v0, v4, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; SI-NEXT: .LBB8_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v10 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; SI-NEXT: v_or_b32_e32 v2, v9, v2 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v6 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5i32_to_v10f16: @@ -1152,76 +1116,51 @@ define inreg <10 x half> @bitcast_v5i32_to_v10f16_scalar(<5 x i32> inreg %a, i32 ; SI-NEXT: s_cmp_lg_u32 s21, 0 ; SI-NEXT: s_cbranch_scc0 .LBB9_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s16 +; SI-NEXT: s_lshr_b32 s12, s19, 16 +; SI-NEXT: s_lshr_b32 s13, s17, 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB9_3 ; SI-NEXT: .LBB9_2: ; %cmp.true ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 ; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s12, s19, 16 +; SI-NEXT: s_lshr_b32 s13, s17, 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], 16 ; SI-NEXT: .LBB9_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 -; SI-NEXT: v_or_b32_e32 v0, v7, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v7, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v1, v10, v1 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_or_b32_e32 v4, v7, v4 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s8, 16 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s8, s13, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s18, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s8, s4 +; SI-NEXT: s_and_b32 s8, s19, 0xffff +; SI-NEXT: s_lshl_b32 s9, s12, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s20, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s9, s6 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_mov_b32_e32 v3, s8 +; SI-NEXT: v_mov_b32_e32 v4, s6 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB9_4: -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: s_branch .LBB9_2 ; ; VI-LABEL: bitcast_v5i32_to_v10f16_scalar: @@ -1314,31 +1253,16 @@ define <5 x i32> @bitcast_v10f16_to_v5i32(<10 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v10f16_to_v5i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v10, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v7, v2 +; SI-NEXT: v_mov_b32_e32 v8, v1 +; SI-NEXT: v_mov_b32_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -1351,33 +1275,38 @@ define <5 x i32> @bitcast_v10f16_to_v5i32(<10 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB10_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; SI-NEXT: v_or_b32_e32 v0, v14, v0 -; SI-NEXT: v_or_b32_e32 v1, v12, v1 -; SI-NEXT: v_or_b32_e32 v2, v10, v2 -; SI-NEXT: v_or_b32_e32 v3, v8, v3 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v15 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v12 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB10_2 ; SI-NEXT: .LBB10_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1385,10 +1314,10 @@ define <5 x i32> @bitcast_v10f16_to_v5i32(<10 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -1396,12 +1325,12 @@ define <5 x i32> @bitcast_v10f16_to_v5i32(<10 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v10 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -1502,50 +1431,35 @@ define inreg <5 x i32> @bitcast_v10f16_to_v5i32_scalar(<10 x half> inreg %a, i32 ; SI-LABEL: bitcast_v10f16_to_v5i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_lshr_b32 s9, s20, 16 +; SI-NEXT: s_lshr_b32 s12, s19, 16 +; SI-NEXT: s_lshr_b32 s13, s18, 16 +; SI-NEXT: s_lshr_b32 s14, s17, 16 +; SI-NEXT: s_lshr_b32 s15, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s21, 0 -; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: s_cbranch_scc0 .LBB11_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; SI-NEXT: v_or_b32_e32 v0, v13, v0 -; SI-NEXT: v_or_b32_e32 v1, v11, v1 -; SI-NEXT: v_or_b32_e32 v2, v9, v2 -; SI-NEXT: v_or_b32_e32 v3, v7, v3 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s14, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s13, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s12, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s10, s9, 16 +; SI-NEXT: s_or_b32 s8, s8, s10 +; SI-NEXT: s_cbranch_execnz .LBB11_4 ; SI-NEXT: .LBB11_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1553,10 +1467,10 @@ define inreg <5 x i32> @bitcast_v10f16_to_v5i32_scalar(<10 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s13 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -1564,29 +1478,35 @@ define inreg <5 x i32> @bitcast_v10f16_to_v5i32_scalar(<10 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s12 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB11_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 +; SI-NEXT: .LBB11_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8 ; SI-NEXT: s_branch .LBB11_2 +; SI-NEXT: .LBB11_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10f16_to_v5i32_scalar: ; VI: ; %bb.0: @@ -2309,94 +2229,58 @@ define <10 x half> @bitcast_v5f32_to_v10f16(<5 x float> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: .LBB16_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v5, v0, v4, 16 +; SI-NEXT: v_alignbit_b32 v6, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v7, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v0 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v0 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v6, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v7, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v5, v0, v4, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v10 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; SI-NEXT: v_or_b32_e32 v2, v9, v2 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v5f32_to_v10f16: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v6 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v5f32_to_v10f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 @@ -2462,79 +2346,61 @@ define inreg <10 x half> @bitcast_v5f32_to_v10f16_scalar(<5 x float> inreg %a, i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s21, 0 -; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: s_cbranch_scc0 .LBB17_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s16 -; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: s_lshr_b32 s13, s19, 16 +; SI-NEXT: s_lshr_b32 s12, s17, 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 16 +; SI-NEXT: s_cbranch_execnz .LBB17_4 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v9, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v8, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 ; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 ; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v9 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: .LBB17_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 -; SI-NEXT: v_or_b32_e32 v0, v7, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v7, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v1, v10, v1 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_or_b32_e32 v4, v7, v4 -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB17_4: -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_lshr_b64 v[7:8], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[8:9], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[4:5], 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: s_branch .LBB17_5 +; SI-NEXT: .LBB17_3: +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: s_branch .LBB17_2 +; SI-NEXT: .LBB17_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v6, s12 +; SI-NEXT: v_mov_b32_e32 v9, s13 +; SI-NEXT: v_mov_b32_e32 v5, s6 +; SI-NEXT: v_mov_b32_e32 v8, s8 +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: .LBB17_5: ; %end +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v2, v2, v6 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5f32_to_v10f16_scalar: ; VI: ; %bb.0: @@ -2636,31 +2502,16 @@ define <5 x float> @bitcast_v10f16_to_v5f32(<10 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v10f16_to_v5f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v10, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v7, v2 +; SI-NEXT: v_mov_b32_e32 v8, v1 +; SI-NEXT: v_mov_b32_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -2673,33 +2524,38 @@ define <5 x float> @bitcast_v10f16_to_v5f32(<10 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB18_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; SI-NEXT: v_or_b32_e32 v0, v14, v0 -; SI-NEXT: v_or_b32_e32 v1, v12, v1 -; SI-NEXT: v_or_b32_e32 v2, v10, v2 -; SI-NEXT: v_or_b32_e32 v3, v8, v3 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v15 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v12 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: .LBB18_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -2707,10 +2563,10 @@ define <5 x float> @bitcast_v10f16_to_v5f32(<10 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -2718,12 +2574,12 @@ define <5 x float> @bitcast_v10f16_to_v5f32(<10 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v10 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -2824,50 +2680,35 @@ define inreg <5 x float> @bitcast_v10f16_to_v5f32_scalar(<10 x half> inreg %a, i ; SI-LABEL: bitcast_v10f16_to_v5f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_lshr_b32 s9, s20, 16 +; SI-NEXT: s_lshr_b32 s12, s19, 16 +; SI-NEXT: s_lshr_b32 s13, s18, 16 +; SI-NEXT: s_lshr_b32 s14, s17, 16 +; SI-NEXT: s_lshr_b32 s15, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s21, 0 -; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: s_cbranch_scc0 .LBB19_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; SI-NEXT: v_or_b32_e32 v0, v13, v0 -; SI-NEXT: v_or_b32_e32 v1, v11, v1 -; SI-NEXT: v_or_b32_e32 v2, v9, v2 -; SI-NEXT: v_or_b32_e32 v3, v7, v3 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s14, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s13, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s12, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s10, s9, 16 +; SI-NEXT: s_or_b32 s8, s8, s10 +; SI-NEXT: s_cbranch_execnz .LBB19_4 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -2875,10 +2716,10 @@ define inreg <5 x float> @bitcast_v10f16_to_v5f32_scalar(<10 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s13 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -2886,29 +2727,35 @@ define inreg <5 x float> @bitcast_v10f16_to_v5f32_scalar(<10 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s12 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB19_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 +; SI-NEXT: .LBB19_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8 ; SI-NEXT: s_branch .LBB19_2 +; SI-NEXT: .LBB19_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10f16_to_v5f32_scalar: ; VI: ; %bb.0: @@ -3025,92 +2872,97 @@ define <10 x half> @bitcast_v10i16_to_v10f16(<10 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v10i16_to_v10f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v6 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB20_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v10, v1, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v11, v0, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v5, v1, v18 +; SI-NEXT: v_or_b32_e32 v9, v0, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_alignbit_b32 v12, v10, v17, 16 +; SI-NEXT: v_alignbit_b32 v13, v5, v19, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v15 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: .LBB20_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB20_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v15 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v2, v19, v2 +; SI-NEXT: v_or_b32_e32 v0, v16, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v2, v18, v2 +; SI-NEXT: v_or_b32_e32 v0, v15, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_alignbit_b32 v12, v10, v11, 16 +; SI-NEXT: v_alignbit_b32 v13, v5, v9, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 ; SI-NEXT: .LBB20_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v11 -; SI-NEXT: v_or_b32_e32 v2, v5, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v9 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10i16_to_v10f16: @@ -3197,79 +3049,93 @@ define inreg <10 x half> @bitcast_v10i16_to_v10f16_scalar(<10 x i16> inreg %a, i ; SI-LABEL: bitcast_v10i16_to_v10f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_lshr_b32 s14, s20, 16 +; SI-NEXT: s_lshr_b32 s22, s19, 16 +; SI-NEXT: s_lshr_b32 s24, s18, 16 +; SI-NEXT: s_lshr_b32 s15, s17, 16 +; SI-NEXT: s_lshr_b32 s23, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s21, 0 ; SI-NEXT: s_cbranch_scc0 .LBB21_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: s_and_b32 s5, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s14, 16 +; SI-NEXT: s_or_b32 s21, s5, s7 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s15, 16 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s26, s23, 16 +; SI-NEXT: s_or_b32 s27, s5, s7 +; SI-NEXT: s_and_b32 s5, s19, 0xffff +; SI-NEXT: s_lshl_b32 s7, s22, 16 +; SI-NEXT: s_or_b32 s6, s4, s26 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s28, s24, 16 +; SI-NEXT: s_or_b32 s29, s5, s7 +; SI-NEXT: s_or_b32 s4, s4, s28 +; SI-NEXT: s_lshr_b64 s[8:9], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[28:29], 16 +; SI-NEXT: s_mov_b32 s7, s27 +; SI-NEXT: s_mov_b32 s5, s29 ; SI-NEXT: s_cbranch_execnz .LBB21_3 ; SI-NEXT: .LBB21_2: ; %cmp.true -; SI-NEXT: s_add_i32 s10, s10, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s24, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s19, 0xffff +; SI-NEXT: s_lshl_b32 s6, s22, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s23, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s8, s15, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s14, 16 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s21, s8, 0x30000 +; SI-NEXT: s_lshr_b64 s[8:9], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 16 +; SI-NEXT: s_lshr_b32 s15, s7, 16 +; SI-NEXT: s_lshr_b32 s22, s5, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 ; SI-NEXT: .LBB21_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v0, v0, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v2, v2, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v9 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v1, v1, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v3, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s8, s15, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s8, s10, 16 +; SI-NEXT: s_or_b32 s4, s4, s8 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s8, s22, 16 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_and_b32 s8, s21, 0xffff +; SI-NEXT: s_lshl_b32 s9, s14, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_mov_b32_e32 v3, s5 +; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB21_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr21 ; SI-NEXT: s_branch .LBB21_2 ; ; VI-LABEL: bitcast_v10i16_to_v10f16_scalar: @@ -3389,95 +3255,75 @@ define <10 x i16> @bitcast_v10f16_to_v10i16(<10 x half> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v11 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB22_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4 -; SI-NEXT: v_or_b32_e32 v6, v6, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v3 -; SI-NEXT: v_or_b32_e32 v8, v8, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v6 +; SI-NEXT: v_or_b32_e32 v4, v4, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v7 +; SI-NEXT: v_or_b32_e32 v3, v3, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v8 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v2, v2, v10 +; SI-NEXT: v_or_b32_e32 v1, v1, v10 ; SI-NEXT: v_or_b32_e32 v0, v0, v5 -; SI-NEXT: v_or_b32_e32 v7, v7, v9 -; SI-NEXT: v_alignbit_b32 v10, v2, v5, 16 -; SI-NEXT: v_alignbit_b32 v9, v8, v9, 16 +; SI-NEXT: v_or_b32_e32 v2, v2, v9 +; SI-NEXT: v_alignbit_b32 v10, v1, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v3, v9, 16 ; SI-NEXT: .LBB22_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v5 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 ; SI-NEXT: v_or_b32_e32 v2, v2, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10f16_to_v10i16: @@ -3565,97 +3411,89 @@ define inreg <10 x i16> @bitcast_v10f16_to_v10i16_scalar(<10 x half> inreg %a, i ; SI-LABEL: bitcast_v10f16_to_v10i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 +; SI-NEXT: s_lshr_b32 s8, s20, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s9, s18, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s10, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s21, 0 -; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: s_cbranch_scc0 .LBB23_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: s_cbranch_execnz .LBB23_4 ; SI-NEXT: .LBB23_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v12, v2, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_or_b32_e32 v9, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v10 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_or_b32_e32 v11, v6, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s20 +; SI-NEXT: v_or_b32_e32 v10, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s17 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v8 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_or_b32_e32 v3, v1, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_or_b32_e32 v1, v4, v1 ; SI-NEXT: v_lshr_b64 v[6:7], v[0:1], 16 ; SI-NEXT: v_lshr_b64 v[4:5], v[2:3], 16 ; SI-NEXT: v_or_b32_e32 v5, v14, v13 -; SI-NEXT: .LBB23_3: ; %end -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: s_branch .LBB23_5 +; SI-NEXT: .LBB23_3: +; SI-NEXT: s_branch .LBB23_2 +; SI-NEXT: .LBB23_4: +; SI-NEXT: v_mov_b32_e32 v8, s8 +; SI-NEXT: v_mov_b32_e32 v11, s7 +; SI-NEXT: v_mov_b32_e32 v12, s6 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v5, s20 +; SI-NEXT: v_mov_b32_e32 v10, s18 +; SI-NEXT: v_mov_b32_e32 v9, s16 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v4, s9 +; SI-NEXT: .LBB23_5: ; %end +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v10 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v11 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v8 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB23_4: -; SI-NEXT: s_branch .LBB23_2 ; ; VI-LABEL: bitcast_v10f16_to_v10i16_scalar: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll index ed44b1c0b294a..b6b321a08f7aa 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll @@ -10,22 +10,15 @@ define half @bitcast_i16_to_f16(i16 %a, i32 %b) { ; SI-LABEL: bitcast_i16_to_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; %bb.4: ; %end +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: ; %bb.2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_i16_to_f16: @@ -114,16 +107,13 @@ define inreg half @bitcast_i16_to_f16_scalar(i16 inreg %a, i32 inreg %b) { ; SI-NEXT: s_cmp_lg_u32 s17, 0 ; SI-NEXT: s_cbranch_scc0 .LBB1_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 ; SI-NEXT: s_cbranch_execnz .LBB1_3 ; SI-NEXT: .LBB1_2: ; %cmp.true ; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 ; SI-NEXT: .LBB1_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB1_4: -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_branch .LBB1_2 ; ; VI-LABEL: bitcast_i16_to_f16_scalar: @@ -195,17 +185,27 @@ define i16 @bitcast_f16_to_i16(half %a, i32 %b) { ; SI-LABEL: bitcast_f16_to_i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_mov_b32_e32 v2, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB2_3 +; SI-NEXT: ; %bb.1: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_cbranch_execnz .LBB2_4 +; SI-NEXT: .LBB2_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB2_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB2_2 +; SI-NEXT: .LBB2_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: ; %bb.2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -291,20 +291,22 @@ define inreg i16 @bitcast_f16_to_i16_scalar(half inreg %a, i32 inreg %b) { ; SI-LABEL: bitcast_f16_to_i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 ; SI-NEXT: s_cmp_lg_u32 s17, 0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_cbranch_scc0 .LBB3_4 +; SI-NEXT: s_cbranch_scc0 .LBB3_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB3_3 +; SI-NEXT: s_and_b32 s6, s16, 0xffff +; SI-NEXT: s_cbranch_execnz .LBB3_4 ; SI-NEXT: .LBB3_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: .LBB3_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB3_4: +; SI-NEXT: .LBB3_3: +; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: s_branch .LBB3_2 +; SI-NEXT: .LBB3_4: +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f16_to_i16_scalar: ; VI: ; %bb.0: @@ -838,26 +840,24 @@ define bfloat @bitcast_f16_to_bf16(half %a, i32 %b) { ; SI-LABEL: bitcast_f16_to_bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB8_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; SI-NEXT: .LBB8_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -943,25 +943,26 @@ define inreg bfloat @bitcast_f16_to_bf16_scalar(half inreg %a, i32 inreg %b) { ; SI-LABEL: bitcast_f16_to_bf16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 ; SI-NEXT: s_cmp_lg_u32 s17, 0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: s_cbranch_scc0 .LBB9_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: s_lshl_b32 s6, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB9_4 ; SI-NEXT: .LBB9_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; SI-NEXT: .LBB9_3: ; %end -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_branch .LBB9_5 +; SI-NEXT: .LBB9_3: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB9_2 +; SI-NEXT: .LBB9_4: +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: .LBB9_5: ; %end +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB9_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: s_branch .LBB9_2 ; ; VI-LABEL: bitcast_f16_to_bf16_scalar: ; VI: ; %bb.0: @@ -1059,20 +1060,23 @@ define half @bitcast_bf16_to_f16(bfloat %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB10_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB10_4 +; SI-NEXT: .LBB10_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB10_3: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_cbranch_execz .LBB10_2 +; SI-NEXT: .LBB10_4: ; %cmp.true ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_bf16_to_f16: @@ -1196,22 +1200,19 @@ define inreg half @bitcast_bf16_to_f16_scalar(bfloat inreg %a, i32 inreg %b) { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_lshl_b32 s4, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s17, 0 -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 ; SI-NEXT: s_cbranch_scc0 .LBB11_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; SI-NEXT: s_cbranch_execnz .LBB11_3 ; SI-NEXT: .LBB11_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 ; SI-NEXT: .LBB11_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB11_4: -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_branch .LBB11_2 ; ; VI-LABEL: bitcast_bf16_to_f16_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll index 94ccde5a0a948..d463b115d1088 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll @@ -1890,100 +1890,57 @@ define <12 x half> @bitcast_v6i32_to_v12f16(<6 x i32> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: .LBB16_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v6, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v7, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v9, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_alignbit_b32 v6, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v7, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v9, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 ; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v14 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v9 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_or_b32_e32 v2, v2, v7 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6i32_to_v12f16: @@ -2064,89 +2021,59 @@ define inreg <12 x half> @bitcast_v6i32_to_v12f16_scalar(<6 x i32> inreg %a, i32 ; SI-NEXT: s_cmp_lg_u32 s22, 0 ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s19, 16 +; SI-NEXT: s_lshr_b32 s14, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 ; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s19, 16 +; SI-NEXT: s_lshr_b32 s14, s17, 16 ; SI-NEXT: .LBB17_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v2, v8, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_or_b32_e32 v1, v12, v1 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s8, 16 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s8, s14, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s8, s6 +; SI-NEXT: s_and_b32 s8, s19, 0xffff +; SI-NEXT: s_lshl_b32 s9, s13, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s20, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s9, s4 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s12, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s8 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v6i32_to_v12f16_scalar: @@ -2244,36 +2171,18 @@ define <6 x i32> @bitcast_v12f16_to_v6i32(<12 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v12f16_to_v6i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v5 +; SI-NEXT: v_mov_b32_e32 v12, v5 +; SI-NEXT: v_mov_b32_e32 v7, v4 +; SI-NEXT: v_mov_b32_e32 v8, v3 +; SI-NEXT: v_mov_b32_e32 v9, v2 +; SI-NEXT: v_mov_b32_e32 v10, v1 +; SI-NEXT: v_mov_b32_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -2286,37 +2195,43 @@ define <6 x i32> @bitcast_v12f16_to_v6i32(<12 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB18_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v8 -; SI-NEXT: v_or_b32_e32 v0, v17, v0 -; SI-NEXT: v_or_b32_e32 v1, v15, v1 -; SI-NEXT: v_or_b32_e32 v2, v13, v2 -; SI-NEXT: v_or_b32_e32 v3, v11, v3 -; SI-NEXT: v_or_b32_e32 v4, v9, v4 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v15 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v13 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: .LBB18_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -2325,31 +2240,31 @@ define <6 x i32> @bitcast_v12f16_to_v6i32(<12 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v8 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v15 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -2454,57 +2369,39 @@ define inreg <6 x i32> @bitcast_v12f16_to_v6i32_scalar(<12 x half> inreg %a, i32 ; SI-LABEL: bitcast_v12f16_to_v6i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s20, 16 +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: s_lshr_b32 s15, s18, 16 +; SI-NEXT: s_lshr_b32 s23, s17, 16 +; SI-NEXT: s_lshr_b32 s24, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s22, 0 -; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: s_cbranch_scc0 .LBB19_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_or_b32_e32 v0, v16, v0 -; SI-NEXT: v_or_b32_e32 v1, v14, v1 -; SI-NEXT: v_or_b32_e32 v2, v12, v2 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: v_or_b32_e32 v4, v8, v4 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s24, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s23, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s15, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s14, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s13, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s12, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_cbranch_execnz .LBB19_4 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -2513,47 +2410,54 @@ define inreg <6 x i32> @bitcast_v12f16_to_v6i32_scalar(<12 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s15 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s14 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB19_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 +; SI-NEXT: .LBB19_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 ; SI-NEXT: s_branch .LBB19_2 +; SI-NEXT: .LBB19_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f16_to_v6i32_scalar: ; VI: ; %bb.0: @@ -4162,100 +4066,57 @@ define <12 x half> @bitcast_v6f32_to_v12f16(<6 x float> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB32_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: .LBB32_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v6, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v7, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v9, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_alignbit_b32 v6, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v7, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v9, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 ; SI-NEXT: .LBB32_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v14 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v9 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_or_b32_e32 v2, v2, v7 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f32_to_v12f16: @@ -4331,92 +4192,70 @@ define inreg <12 x half> @bitcast_v6f32_to_v12f16_scalar(<6 x float> inreg %a, i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s22, 0 -; SI-NEXT: s_cbranch_scc0 .LBB33_4 +; SI-NEXT: s_cbranch_scc0 .LBB33_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 -; SI-NEXT: s_cbranch_execnz .LBB33_3 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s19, 16 +; SI-NEXT: s_lshr_b32 s12, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 16 +; SI-NEXT: s_cbranch_execnz .LBB33_4 ; SI-NEXT: .LBB33_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v11, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v10, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v9, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 ; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v7, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v4, s21, 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: .LBB33_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v2, v8, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_or_b32_e32 v1, v12, v1 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_lshr_b64 v[6:7], v[4:5], 16 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_lshr_b64 v[7:8], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[8:9], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-NEXT: s_branch .LBB33_5 +; SI-NEXT: .LBB33_3: +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: s_branch .LBB33_2 +; SI-NEXT: .LBB33_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v11, s12 +; SI-NEXT: v_mov_b32_e32 v10, s13 +; SI-NEXT: v_mov_b32_e32 v9, s14 +; SI-NEXT: v_mov_b32_e32 v8, s8 +; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: .LBB33_5: ; %end +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_or_b32_e32 v2, v2, v7 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; SI-NEXT: v_or_b32_e32 v1, v1, v8 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB33_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: s_branch .LBB33_2 ; ; VI-LABEL: bitcast_v6f32_to_v12f16_scalar: ; VI: ; %bb.0: @@ -4522,36 +4361,18 @@ define <6 x float> @bitcast_v12f16_to_v6f32(<12 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v12f16_to_v6f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v5 +; SI-NEXT: v_mov_b32_e32 v12, v5 +; SI-NEXT: v_mov_b32_e32 v7, v4 +; SI-NEXT: v_mov_b32_e32 v8, v3 +; SI-NEXT: v_mov_b32_e32 v9, v2 +; SI-NEXT: v_mov_b32_e32 v10, v1 +; SI-NEXT: v_mov_b32_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -4564,37 +4385,43 @@ define <6 x float> @bitcast_v12f16_to_v6f32(<12 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB34_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v8 -; SI-NEXT: v_or_b32_e32 v0, v17, v0 -; SI-NEXT: v_or_b32_e32 v1, v15, v1 -; SI-NEXT: v_or_b32_e32 v2, v13, v2 -; SI-NEXT: v_or_b32_e32 v3, v11, v3 -; SI-NEXT: v_or_b32_e32 v4, v9, v4 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v15 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v13 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_2 ; SI-NEXT: .LBB34_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -4603,31 +4430,31 @@ define <6 x float> @bitcast_v12f16_to_v6f32(<12 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v8 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v15 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -4732,57 +4559,39 @@ define inreg <6 x float> @bitcast_v12f16_to_v6f32_scalar(<12 x half> inreg %a, i ; SI-LABEL: bitcast_v12f16_to_v6f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s20, 16 +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: s_lshr_b32 s15, s18, 16 +; SI-NEXT: s_lshr_b32 s23, s17, 16 +; SI-NEXT: s_lshr_b32 s24, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s22, 0 -; SI-NEXT: s_cbranch_scc0 .LBB35_4 +; SI-NEXT: s_cbranch_scc0 .LBB35_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_or_b32_e32 v0, v16, v0 -; SI-NEXT: v_or_b32_e32 v1, v14, v1 -; SI-NEXT: v_or_b32_e32 v2, v12, v2 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: v_or_b32_e32 v4, v8, v4 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: s_cbranch_execnz .LBB35_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s24, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s23, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s15, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s14, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s13, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s12, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_cbranch_execnz .LBB35_4 ; SI-NEXT: .LBB35_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -4791,47 +4600,54 @@ define inreg <6 x float> @bitcast_v12f16_to_v6f32_scalar(<12 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s15 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s14 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: .LBB35_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB35_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 +; SI-NEXT: .LBB35_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 ; SI-NEXT: s_branch .LBB35_2 +; SI-NEXT: .LBB35_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f16_to_v6f32_scalar: ; VI: ; %bb.0: @@ -6009,47 +5825,22 @@ define <12 x half> @bitcast_v3i64_to_v12f16(<3 x i64> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB44_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: .LBB44_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v6, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v7, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v9, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_4 ; SI-NEXT: ; %bb.3: ; %cmp.true @@ -6059,50 +5850,32 @@ define <12 x half> @bitcast_v3i64_to_v12f16(<3 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_alignbit_b32 v6, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v7, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v9, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 ; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v14 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v8 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_or_b32_e32 v2, v2, v7 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3i64_to_v12f16: @@ -6185,89 +5958,59 @@ define inreg <12 x half> @bitcast_v3i64_to_v12f16_scalar(<3 x i64> inreg %a, i32 ; SI-NEXT: s_cmp_lg_u32 s22, 0 ; SI-NEXT: s_cbranch_scc0 .LBB45_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s16 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s19, 16 +; SI-NEXT: s_lshr_b32 s14, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s16, 3 -; SI-NEXT: s_addc_u32 s5, s17, 0 -; SI-NEXT: s_lshr_b32 s6, s4, 16 -; SI-NEXT: s_lshr_b32 s7, s5, 16 -; SI-NEXT: s_add_u32 s8, s18, 3 -; SI-NEXT: s_addc_u32 s9, s19, 0 -; SI-NEXT: s_lshr_b32 s10, s8, 16 -; SI-NEXT: s_lshr_b32 s11, s9, 16 -; SI-NEXT: s_add_u32 s12, s20, 3 -; SI-NEXT: s_addc_u32 s13, s21, 0 -; SI-NEXT: s_lshr_b32 s14, s12, 16 -; SI-NEXT: s_lshr_b32 s15, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s6 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s19, 16 +; SI-NEXT: s_lshr_b32 s14, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 16 ; SI-NEXT: .LBB45_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 -; SI-NEXT: v_or_b32_e32 v0, v10, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v2, v8, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_or_b32_e32 v1, v12, v1 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: v_or_b32_e32 v4, v7, v4 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s8, 16 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s8, s14, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s8, s6 +; SI-NEXT: s_and_b32 s8, s19, 0xffff +; SI-NEXT: s_lshl_b32 s9, s13, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s20, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s9, s4 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s12, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s8 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: s_branch .LBB45_2 ; ; VI-LABEL: bitcast_v3i64_to_v12f16_scalar: @@ -6365,36 +6108,18 @@ define <3 x i64> @bitcast_v12f16_to_v3i64(<12 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v12f16_to_v3i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v5 +; SI-NEXT: v_mov_b32_e32 v12, v5 +; SI-NEXT: v_mov_b32_e32 v7, v4 +; SI-NEXT: v_mov_b32_e32 v8, v3 +; SI-NEXT: v_mov_b32_e32 v9, v2 +; SI-NEXT: v_mov_b32_e32 v10, v1 +; SI-NEXT: v_mov_b32_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -6407,37 +6132,43 @@ define <3 x i64> @bitcast_v12f16_to_v3i64(<12 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB46_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v8 -; SI-NEXT: v_or_b32_e32 v0, v17, v0 -; SI-NEXT: v_or_b32_e32 v1, v15, v1 -; SI-NEXT: v_or_b32_e32 v2, v13, v2 -; SI-NEXT: v_or_b32_e32 v3, v11, v3 -; SI-NEXT: v_or_b32_e32 v4, v9, v4 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v15 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v13 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: .LBB46_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -6446,31 +6177,31 @@ define <3 x i64> @bitcast_v12f16_to_v3i64(<12 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v8 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v15 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -6575,57 +6306,39 @@ define inreg <3 x i64> @bitcast_v12f16_to_v3i64_scalar(<12 x half> inreg %a, i32 ; SI-LABEL: bitcast_v12f16_to_v3i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s20, 16 +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: s_lshr_b32 s15, s18, 16 +; SI-NEXT: s_lshr_b32 s23, s17, 16 +; SI-NEXT: s_lshr_b32 s24, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s22, 0 -; SI-NEXT: s_cbranch_scc0 .LBB47_4 +; SI-NEXT: s_cbranch_scc0 .LBB47_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_or_b32_e32 v0, v16, v0 -; SI-NEXT: v_or_b32_e32 v1, v14, v1 -; SI-NEXT: v_or_b32_e32 v2, v12, v2 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: v_or_b32_e32 v4, v8, v4 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s24, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s23, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s15, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s14, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s13, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s12, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_cbranch_execnz .LBB47_4 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -6634,47 +6347,54 @@ define inreg <3 x i64> @bitcast_v12f16_to_v3i64_scalar(<12 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s15 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s14 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB47_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 +; SI-NEXT: .LBB47_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 ; SI-NEXT: s_branch .LBB47_2 +; SI-NEXT: .LBB47_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f16_to_v3i64_scalar: ; VI: ; %bb.0: @@ -7447,94 +7167,54 @@ define <12 x half> @bitcast_v3f64_to_v12f16(<3 x double> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB52_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: .LBB52_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v6, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v7, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v9, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 -; SI-NEXT: .LBB52_4: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v14 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v9 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_alignbit_b32 v6, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v7, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v9, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-NEXT: .LBB52_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_or_b32_e32 v2, v2, v7 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3f64_to_v12f16: @@ -7607,89 +7287,67 @@ define inreg <12 x half> @bitcast_v3f64_to_v12f16_scalar(<3 x double> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s22, 0 -; SI-NEXT: s_cbranch_scc0 .LBB53_4 +; SI-NEXT: s_cbranch_scc0 .LBB53_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 -; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s19, 16 +; SI-NEXT: s_lshr_b32 s12, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 16 +; SI-NEXT: s_cbranch_execnz .LBB53_4 ; SI-NEXT: .LBB53_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[3:4], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[11:12], s[16:17], 1.0 -; SI-NEXT: v_add_f64 v[9:10], s[18:19], 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: .LBB53_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v2, v8, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_or_b32_e32 v1, v12, v1 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_lshr_b64 v[6:7], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[7:8], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[8:9], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; SI-NEXT: s_branch .LBB53_5 +; SI-NEXT: .LBB53_3: +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: s_branch .LBB53_2 +; SI-NEXT: .LBB53_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v9, s12 +; SI-NEXT: v_mov_b32_e32 v11, s13 +; SI-NEXT: v_mov_b32_e32 v10, s14 +; SI-NEXT: v_mov_b32_e32 v8, s8 +; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: .LBB53_5: ; %end +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v7 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v10 +; SI-NEXT: v_or_b32_e32 v1, v1, v8 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB53_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: s_branch .LBB53_2 ; ; VI-LABEL: bitcast_v3f64_to_v12f16_scalar: ; VI: ; %bb.0: @@ -7786,36 +7444,18 @@ define <3 x double> @bitcast_v12f16_to_v3f64(<12 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v12f16_to_v3f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v5 +; SI-NEXT: v_mov_b32_e32 v12, v5 +; SI-NEXT: v_mov_b32_e32 v7, v4 +; SI-NEXT: v_mov_b32_e32 v8, v3 +; SI-NEXT: v_mov_b32_e32 v9, v2 +; SI-NEXT: v_mov_b32_e32 v10, v1 +; SI-NEXT: v_mov_b32_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -7828,37 +7468,43 @@ define <3 x double> @bitcast_v12f16_to_v3f64(<12 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB54_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v8 -; SI-NEXT: v_or_b32_e32 v0, v17, v0 -; SI-NEXT: v_or_b32_e32 v1, v15, v1 -; SI-NEXT: v_or_b32_e32 v2, v13, v2 -; SI-NEXT: v_or_b32_e32 v3, v11, v3 -; SI-NEXT: v_or_b32_e32 v4, v9, v4 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v15 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v13 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_2 ; SI-NEXT: .LBB54_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -7867,31 +7513,31 @@ define <3 x double> @bitcast_v12f16_to_v3f64(<12 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v8 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v15 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -7996,57 +7642,39 @@ define inreg <3 x double> @bitcast_v12f16_to_v3f64_scalar(<12 x half> inreg %a, ; SI-LABEL: bitcast_v12f16_to_v3f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s20, 16 +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: s_lshr_b32 s15, s18, 16 +; SI-NEXT: s_lshr_b32 s23, s17, 16 +; SI-NEXT: s_lshr_b32 s24, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s22, 0 -; SI-NEXT: s_cbranch_scc0 .LBB55_4 +; SI-NEXT: s_cbranch_scc0 .LBB55_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_or_b32_e32 v0, v16, v0 -; SI-NEXT: v_or_b32_e32 v1, v14, v1 -; SI-NEXT: v_or_b32_e32 v2, v12, v2 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: v_or_b32_e32 v4, v8, v4 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: s_cbranch_execnz .LBB55_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s24, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s23, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s15, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s14, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s13, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s12, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_cbranch_execnz .LBB55_4 ; SI-NEXT: .LBB55_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -8055,47 +7683,54 @@ define inreg <3 x double> @bitcast_v12f16_to_v3f64_scalar(<12 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s15 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s14 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: .LBB55_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB55_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 +; SI-NEXT: .LBB55_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 ; SI-NEXT: s_branch .LBB55_2 +; SI-NEXT: .LBB55_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f16_to_v3f64_scalar: ; VI: ; %bb.0: @@ -8222,107 +7857,114 @@ define <12 x half> @bitcast_v12i16_to_v12f16(<12 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v12i16_to_v12f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v10 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v6, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v13, v1, v19 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v11, v1, v21 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v16, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v6, v1, v22 +; SI-NEXT: v_or_b32_e32 v12, v0, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_alignbit_b32 v14, v13, v18, 16 +; SI-NEXT: v_alignbit_b32 v15, v11, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v6, v23, 16 +; SI-NEXT: v_or_b32_e32 v10, v0, v23 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: .LBB56_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v22 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v18 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v4, v23, v4 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v2, v20, v2 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v4, v22, v4 +; SI-NEXT: v_or_b32_e32 v2, v21, v2 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_alignbit_b32 v14, v13, v16, 16 +; SI-NEXT: v_alignbit_b32 v15, v11, v12, 16 +; SI-NEXT: v_alignbit_b32 v17, v6, v10, 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; SI-NEXT: .LBB56_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v8 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v15 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v17 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i16_to_v12f16: @@ -8414,92 +8056,110 @@ define inreg <12 x half> @bitcast_v12i16_to_v12f16_scalar(<12 x i16> inreg %a, i ; SI-LABEL: bitcast_v12i16_to_v12f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s11, s21, 16 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_lshr_b32 s26, s21, 16 +; SI-NEXT: s_lshr_b32 s29, s20, 16 +; SI-NEXT: s_lshr_b32 s25, s19, 16 +; SI-NEXT: s_lshr_b32 s28, s18, 16 +; SI-NEXT: s_lshr_b32 s24, s17, 16 +; SI-NEXT: s_lshr_b32 s27, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s22, 0 ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s24, 16 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s40, s27, 16 +; SI-NEXT: s_or_b32 s41, s5, s7 +; SI-NEXT: s_and_b32 s5, s19, 0xffff +; SI-NEXT: s_lshl_b32 s7, s25, 16 +; SI-NEXT: s_or_b32 s8, s4, s40 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s42, s28, 16 +; SI-NEXT: s_or_b32 s43, s5, s7 +; SI-NEXT: s_and_b32 s5, s21, 0xffff +; SI-NEXT: s_lshl_b32 s7, s26, 16 +; SI-NEXT: s_or_b32 s6, s4, s42 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s44, s29, 16 +; SI-NEXT: s_or_b32 s45, s5, s7 +; SI-NEXT: s_or_b32 s4, s4, s44 +; SI-NEXT: s_lshr_b64 s[10:11], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[44:45], 16 +; SI-NEXT: s_mov_b32 s9, s41 +; SI-NEXT: s_mov_b32 s7, s43 +; SI-NEXT: s_mov_b32 s5, s45 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s10, s10, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s21, 0xffff +; SI-NEXT: s_lshl_b32 s6, s26, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s28, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s25, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s16, 0xffff +; SI-NEXT: s_lshl_b32 s9, s27, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s17, 0xffff +; SI-NEXT: s_lshl_b32 s10, s24, 16 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_lshr_b64 s[10:11], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 16 +; SI-NEXT: s_lshr_b32 s24, s9, 16 +; SI-NEXT: s_lshr_b32 s25, s7, 16 +; SI-NEXT: s_lshr_b32 s26, s5, 16 ; SI-NEXT: .LBB57_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v0, v0, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_or_b32_e32 v1, v1, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v2, v2, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_or_b32_e32 v3, v3, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v4, v4, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_or_b32 s8, s8, s10 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s10, s24, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s10, s12, 16 +; SI-NEXT: s_or_b32 s6, s6, s10 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s10, s25, 16 +; SI-NEXT: s_or_b32 s7, s7, s10 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s10, s14, 16 +; SI-NEXT: s_or_b32 s4, s4, s10 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s10, s26, 16 +; SI-NEXT: s_or_b32 s5, s5, s10 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_mov_b32_e32 v5, s5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v12i16_to_v12f16_scalar: @@ -8627,113 +8287,89 @@ define <12 x i16> @bitcast_v12f16_to_v12i16(<12 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v12f16_to_v12i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v13 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v5 -; SI-NEXT: v_or_b32_e32 v7, v7, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v3 -; SI-NEXT: v_or_b32_e32 v9, v9, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v8 +; SI-NEXT: v_or_b32_e32 v3, v3, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v2, v2, v12 +; SI-NEXT: v_or_b32_e32 v1, v1, v12 ; SI-NEXT: v_or_b32_e32 v0, v0, v6 -; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_or_b32_e32 v2, v2, v11 ; SI-NEXT: v_or_b32_e32 v4, v4, v10 -; SI-NEXT: v_alignbit_b32 v12, v2, v6, 16 -; SI-NEXT: v_alignbit_b32 v11, v9, v11, 16 -; SI-NEXT: v_alignbit_b32 v10, v7, v10, 16 +; SI-NEXT: v_alignbit_b32 v12, v1, v6, 16 +; SI-NEXT: v_alignbit_b32 v11, v3, v11, 16 +; SI-NEXT: v_alignbit_b32 v10, v5, v10, 16 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v6 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 ; SI-NEXT: v_or_b32_e32 v2, v2, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v10 ; SI-NEXT: v_or_b32_e32 v4, v4, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f16_to_v12i16: @@ -8826,114 +8462,104 @@ define inreg <12 x i16> @bitcast_v12f16_to_v12i16_scalar(<12 x half> inreg %a, i ; SI-LABEL: bitcast_v12f16_to_v12i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v0 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s9, s20, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s10, s18, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s11, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s22, 0 -; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: s_cbranch_scc0 .LBB59_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: s_cbranch_execnz .LBB59_4 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v16, v2, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_or_b32_e32 v15, v7, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v12 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v7 +; SI-NEXT: v_or_b32_e32 v12, v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v14, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s21 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 -; SI-NEXT: v_or_b32_e32 v3, v3, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v14 -; SI-NEXT: v_or_b32_e32 v5, v5, v7 -; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s17 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_or_b32_e32 v5, v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 +; SI-NEXT: v_or_b32_e32 v3, v1, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; SI-NEXT: v_or_b32_e32 v1, v6, v1 ; SI-NEXT: v_lshr_b64 v[10:11], v[0:1], 16 ; SI-NEXT: v_lshr_b64 v[8:9], v[2:3], 16 ; SI-NEXT: v_lshr_b64 v[6:7], v[4:5], 16 ; SI-NEXT: v_or_b32_e32 v4, v17, v4 -; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: s_branch .LBB59_5 +; SI-NEXT: .LBB59_3: +; SI-NEXT: s_branch .LBB59_2 +; SI-NEXT: .LBB59_4: +; SI-NEXT: v_mov_b32_e32 v13, s8 +; SI-NEXT: v_mov_b32_e32 v15, s7 +; SI-NEXT: v_mov_b32_e32 v16, s6 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v14, s18 +; SI-NEXT: v_mov_b32_e32 v12, s16 +; SI-NEXT: v_mov_b32_e32 v10, s11 +; SI-NEXT: v_mov_b32_e32 v8, s10 +; SI-NEXT: v_mov_b32_e32 v6, s9 +; SI-NEXT: .LBB59_5: ; %end +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v2, v2, v7 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v15 ; SI-NEXT: v_or_b32_e32 v4, v4, v6 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v13 ; SI-NEXT: v_or_b32_e32 v3, v3, v7 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB59_4: -; SI-NEXT: s_branch .LBB59_2 ; ; VI-LABEL: bitcast_v12f16_to_v12i16_scalar: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll index cd5f3490a69e9..e0fac42ac9d77 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll @@ -1165,114 +1165,64 @@ define <14 x half> @bitcast_v7i32_to_v14f16(<7 x i32> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB8_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: .LBB8_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v7, v0, v6, 16 +; SI-NEXT: v_alignbit_b32 v8, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v9, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v11, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB8_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v0 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_alignbit_b32 v8, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v9, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v11, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v7, v0, v6, 16 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 ; SI-NEXT: .LBB8_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v14 -; SI-NEXT: v_or_b32_e32 v2, v5, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v11 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_or_b32_e32 v2, v2, v9 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_or_b32_e32 v4, v4, v8 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_or_b32_e32 v3, v3, v9 +; SI-NEXT: v_or_b32_e32 v5, v5, v8 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7i32_to_v14f16: @@ -1357,102 +1307,67 @@ define inreg <14 x half> @bitcast_v7i32_to_v14f16_scalar(<7 x i32> inreg %a, i32 ; SI-NEXT: s_cmp_lg_u32 s23, 0 ; SI-NEXT: s_cbranch_scc0 .LBB9_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s16 +; SI-NEXT: s_lshr_b32 s23, s17, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s19, 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB9_3 ; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_lshr_b32 s23, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s19, 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 ; SI-NEXT: .LBB9_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v13 -; SI-NEXT: v_or_b32_e32 v0, v11, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v11, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v3, v9, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_or_b32_e32 v1, v14, v1 -; SI-NEXT: v_or_b32_e32 v4, v11, v4 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s10, 16 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s9, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s18, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s19, 0xffff +; SI-NEXT: s_lshl_b32 s10, s15, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s20, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s10, s4 +; SI-NEXT: s_and_b32 s10, s21, 0xffff +; SI-NEXT: s_lshl_b32 s11, s14, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s22, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s11, s6 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_mov_b32_e32 v5, s10 +; SI-NEXT: v_mov_b32_e32 v6, s6 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB9_4: -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: s_branch .LBB9_2 ; ; VI-LABEL: bitcast_v7i32_to_v14f16_scalar: @@ -1556,41 +1471,20 @@ define <7 x i32> @bitcast_v14f16_to_v7i32(<14 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v14f16_to_v7i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v6 +; SI-NEXT: v_mov_b32_e32 v14, v6 +; SI-NEXT: v_mov_b32_e32 v8, v5 +; SI-NEXT: v_mov_b32_e32 v9, v4 +; SI-NEXT: v_mov_b32_e32 v10, v3 +; SI-NEXT: v_mov_b32_e32 v11, v2 +; SI-NEXT: v_mov_b32_e32 v12, v1 +; SI-NEXT: v_mov_b32_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -1603,20 +1497,34 @@ define <7 x i32> @bitcast_v14f16_to_v7i32(<14 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB10_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v9 -; SI-NEXT: v_or_b32_e32 v0, v20, v0 -; SI-NEXT: v_or_b32_e32 v1, v18, v1 -; SI-NEXT: v_or_b32_e32 v2, v16, v2 -; SI-NEXT: v_or_b32_e32 v3, v14, v3 -; SI-NEXT: v_or_b32_e32 v4, v12, v4 -; SI-NEXT: v_or_b32_e32 v5, v10, v5 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v17 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v16 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v15 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 @@ -1624,20 +1532,13 @@ define <7 x i32> @bitcast_v14f16_to_v7i32(<14 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB10_2 ; SI-NEXT: .LBB10_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -1646,25 +1547,25 @@ define <7 x i32> @bitcast_v14f16_to_v7i32(<14 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v19 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v9 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -1672,12 +1573,12 @@ define <7 x i32> @bitcast_v14f16_to_v7i32(<14 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v16 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v14 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -1789,64 +1690,43 @@ define inreg <7 x i32> @bitcast_v14f16_to_v7i32_scalar(<14 x half> inreg %a, i32 ; SI-LABEL: bitcast_v14f16_to_v7i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_lshr_b32 s11, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s24, s19, 16 +; SI-NEXT: s_lshr_b32 s25, s18, 16 +; SI-NEXT: s_lshr_b32 s26, s17, 16 +; SI-NEXT: s_lshr_b32 s27, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s23, 0 -; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: s_cbranch_scc0 .LBB11_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_or_b32_e32 v0, v19, v0 -; SI-NEXT: v_or_b32_e32 v1, v17, v1 -; SI-NEXT: v_or_b32_e32 v2, v15, v2 -; SI-NEXT: v_or_b32_e32 v3, v13, v3 -; SI-NEXT: v_or_b32_e32 v4, v11, v4 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s27, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s26, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s25, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s24, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s15, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s14, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s12, s11, 16 +; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: s_cbranch_execnz .LBB11_4 ; SI-NEXT: .LBB11_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -1855,25 +1735,25 @@ define inreg <7 x i32> @bitcast_v14f16_to_v7i32_scalar(<14 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s25 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -1881,29 +1761,37 @@ define inreg <7 x i32> @bitcast_v14f16_to_v7i32_scalar(<14 x half> inreg %a, i32 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s22 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB11_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 +; SI-NEXT: .LBB11_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 ; SI-NEXT: s_branch .LBB11_2 +; SI-NEXT: .LBB11_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14f16_to_v7i32_scalar: ; VI: ; %bb.0: @@ -2766,114 +2654,64 @@ define <14 x half> @bitcast_v7f32_to_v14f16(<7 x float> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: .LBB16_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v7, v0, v6, 16 +; SI-NEXT: v_alignbit_b32 v8, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v9, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v11, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v0 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v0 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_alignbit_b32 v8, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v9, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v11, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v7, v0, v6, 16 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 ; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v14 -; SI-NEXT: v_or_b32_e32 v2, v5, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v11 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_or_b32_e32 v2, v2, v9 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_or_b32_e32 v4, v4, v8 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_or_b32_e32 v3, v3, v9 +; SI-NEXT: v_or_b32_e32 v5, v5, v8 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7f32_to_v14f16: @@ -2952,105 +2790,79 @@ define inreg <14 x half> @bitcast_v7f32_to_v14f16_scalar(<7 x float> inreg %a, i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s23, 0 -; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: s_cbranch_scc0 .LBB17_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s16 -; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: s_lshr_b32 s23, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s19, 16 +; SI-NEXT: s_lshr_b32 s14, s17, 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[16:17], 16 +; SI-NEXT: s_cbranch_execnz .LBB17_4 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v13, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v12, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v10, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 ; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v8, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v5, s22, 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v13 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: .LBB17_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v13 -; SI-NEXT: v_or_b32_e32 v0, v11, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v11, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v3, v9, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_or_b32_e32 v1, v14, v1 -; SI-NEXT: v_or_b32_e32 v4, v11, v4 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_lshr_b64 v[7:8], v[4:5], 16 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_lshr_b64 v[10:11], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[11:12], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[8:9], v[6:7], 16 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; SI-NEXT: s_branch .LBB17_5 +; SI-NEXT: .LBB17_3: +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB17_2 +; SI-NEXT: .LBB17_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v9, s14 +; SI-NEXT: v_mov_b32_e32 v13, s15 +; SI-NEXT: v_mov_b32_e32 v12, s23 +; SI-NEXT: v_mov_b32_e32 v8, s6 +; SI-NEXT: v_mov_b32_e32 v11, s10 +; SI-NEXT: v_mov_b32_e32 v10, s8 +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: .LBB17_5: ; %end +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v1, v1, v9 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v2, v2, v9 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v13 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v11 +; SI-NEXT: v_or_b32_e32 v3, v3, v9 ; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB17_4: -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v7f32_to_v14f16_scalar: ; VI: ; %bb.0: @@ -3160,41 +2972,20 @@ define <7 x float> @bitcast_v14f16_to_v7f32(<14 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v14f16_to_v7f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v6 +; SI-NEXT: v_mov_b32_e32 v14, v6 +; SI-NEXT: v_mov_b32_e32 v8, v5 +; SI-NEXT: v_mov_b32_e32 v9, v4 +; SI-NEXT: v_mov_b32_e32 v10, v3 +; SI-NEXT: v_mov_b32_e32 v11, v2 +; SI-NEXT: v_mov_b32_e32 v12, v1 +; SI-NEXT: v_mov_b32_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -3207,20 +2998,34 @@ define <7 x float> @bitcast_v14f16_to_v7f32(<14 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB18_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v9 -; SI-NEXT: v_or_b32_e32 v0, v20, v0 -; SI-NEXT: v_or_b32_e32 v1, v18, v1 -; SI-NEXT: v_or_b32_e32 v2, v16, v2 -; SI-NEXT: v_or_b32_e32 v3, v14, v3 -; SI-NEXT: v_or_b32_e32 v4, v12, v4 -; SI-NEXT: v_or_b32_e32 v5, v10, v5 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v17 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v16 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v15 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 @@ -3228,20 +3033,13 @@ define <7 x float> @bitcast_v14f16_to_v7f32(<14 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: .LBB18_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -3250,25 +3048,25 @@ define <7 x float> @bitcast_v14f16_to_v7f32(<14 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v19 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v9 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -3276,12 +3074,12 @@ define <7 x float> @bitcast_v14f16_to_v7f32(<14 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v16 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v14 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -3393,64 +3191,43 @@ define inreg <7 x float> @bitcast_v14f16_to_v7f32_scalar(<14 x half> inreg %a, i ; SI-LABEL: bitcast_v14f16_to_v7f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_lshr_b32 s11, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s24, s19, 16 +; SI-NEXT: s_lshr_b32 s25, s18, 16 +; SI-NEXT: s_lshr_b32 s26, s17, 16 +; SI-NEXT: s_lshr_b32 s27, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s23, 0 -; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: s_cbranch_scc0 .LBB19_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_or_b32_e32 v0, v19, v0 -; SI-NEXT: v_or_b32_e32 v1, v17, v1 -; SI-NEXT: v_or_b32_e32 v2, v15, v2 -; SI-NEXT: v_or_b32_e32 v3, v13, v3 -; SI-NEXT: v_or_b32_e32 v4, v11, v4 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s27, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s26, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s25, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s24, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s15, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s14, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s12, s11, 16 +; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: s_cbranch_execnz .LBB19_4 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -3459,25 +3236,25 @@ define inreg <7 x float> @bitcast_v14f16_to_v7f32_scalar(<14 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s25 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -3485,29 +3262,37 @@ define inreg <7 x float> @bitcast_v14f16_to_v7f32_scalar(<14 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s22 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB19_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 +; SI-NEXT: .LBB19_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 ; SI-NEXT: s_branch .LBB19_2 +; SI-NEXT: .LBB19_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14f16_to_v7f32_scalar: ; VI: ; %bb.0: @@ -3645,46 +3430,52 @@ define <14 x half> @bitcast_v14i16_to_v14f16(<14 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v14i16_to_v14f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v8 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB20_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v15, v1, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v16, v0, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v13, v1, v25 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v14, v0, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v7, v1, v26 +; SI-NEXT: v_or_b32_e32 v11, v0, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_alignbit_b32 v17, v15, v22, 16 +; SI-NEXT: v_alignbit_b32 v18, v13, v24, 16 +; SI-NEXT: v_alignbit_b32 v19, v7, v27, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v21 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -3692,75 +3483,76 @@ define <14 x half> @bitcast_v14i16_to_v14f16(<14 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: .LBB20_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB20_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v23 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v4, v27, v4 +; SI-NEXT: v_or_b32_e32 v2, v24, v2 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v4, v26, v4 +; SI-NEXT: v_or_b32_e32 v2, v25, v2 +; SI-NEXT: v_or_b32_e32 v0, v21, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_alignbit_b32 v17, v15, v16, 16 +; SI-NEXT: v_alignbit_b32 v18, v13, v14, 16 +; SI-NEXT: v_alignbit_b32 v19, v7, v11, 16 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v20 ; SI-NEXT: .LBB20_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v15 -; SI-NEXT: v_or_b32_e32 v2, v5, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v18 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v13 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v19 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14i16_to_v14f16: @@ -3858,105 +3650,125 @@ define inreg <14 x half> @bitcast_v14i16_to_v14f16_scalar(<14 x i16> inreg %a, i ; SI-LABEL: bitcast_v14i16_to_v14f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s12, s22, 16 -; SI-NEXT: s_lshr_b32 s11, s21, 16 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_lshr_b32 s26, s22, 16 +; SI-NEXT: s_lshr_b32 s29, s21, 16 +; SI-NEXT: s_lshr_b32 s42, s20, 16 +; SI-NEXT: s_lshr_b32 s28, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s27, s17, 16 +; SI-NEXT: s_lshr_b32 s40, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s23, 0 ; SI-NEXT: s_cbranch_scc0 .LBB21_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s12 +; SI-NEXT: s_and_b32 s5, s22, 0xffff +; SI-NEXT: s_lshl_b32 s7, s26, 16 +; SI-NEXT: s_or_b32 s23, s5, s7 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s27, 16 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s44, s40, 16 +; SI-NEXT: s_or_b32 s45, s5, s7 +; SI-NEXT: s_and_b32 s5, s19, 0xffff +; SI-NEXT: s_lshl_b32 s7, s28, 16 +; SI-NEXT: s_or_b32 s8, s4, s44 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s46, s41, 16 +; SI-NEXT: s_or_b32 s47, s5, s7 +; SI-NEXT: s_and_b32 s5, s21, 0xffff +; SI-NEXT: s_lshl_b32 s7, s29, 16 +; SI-NEXT: s_or_b32 s6, s4, s46 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s56, s42, 16 +; SI-NEXT: s_or_b32 s57, s5, s7 +; SI-NEXT: s_or_b32 s4, s4, s56 +; SI-NEXT: s_lshr_b64 s[10:11], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[56:57], 16 +; SI-NEXT: s_mov_b32 s9, s45 +; SI-NEXT: s_mov_b32 s7, s47 +; SI-NEXT: s_mov_b32 s5, s57 ; SI-NEXT: s_cbranch_execnz .LBB21_3 ; SI-NEXT: .LBB21_2: ; %cmp.true -; SI-NEXT: s_add_i32 s12, s12, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s10, s10, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s42, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s21, 0xffff +; SI-NEXT: s_lshl_b32 s6, s29, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s41, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s28, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s12 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s16, 0xffff +; SI-NEXT: s_lshl_b32 s9, s40, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s17, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s26, 16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s23, s10, 0x30000 +; SI-NEXT: s_lshr_b64 s[10:11], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 16 +; SI-NEXT: s_lshr_b32 s27, s9, 16 +; SI-NEXT: s_lshr_b32 s28, s7, 16 +; SI-NEXT: s_lshr_b32 s29, s5, 16 +; SI-NEXT: s_lshr_b32 s26, s23, 16 ; SI-NEXT: .LBB21_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v0, v0, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v2, v2, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v3, v3, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v4, v4, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v13 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v1, v1, v9 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v5, v5, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_or_b32 s8, s8, s10 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s10, s12, 16 +; SI-NEXT: s_or_b32 s6, s6, s10 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s10, s28, 16 +; SI-NEXT: s_or_b32 s7, s7, s10 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s10, s14, 16 +; SI-NEXT: s_or_b32 s4, s4, s10 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s10, s29, 16 +; SI-NEXT: s_or_b32 s5, s5, s10 +; SI-NEXT: s_and_b32 s10, s23, 0xffff +; SI-NEXT: s_lshl_b32 s11, s26, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_mov_b32_e32 v5, s5 +; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB21_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr23 ; SI-NEXT: s_branch .LBB21_2 ; ; VI-LABEL: bitcast_v14i16_to_v14f16_scalar: @@ -4094,128 +3906,100 @@ define <14 x i16> @bitcast_v14f16_to_v14i16(<14 x half> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB22_2 ; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v9, v9, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; SI-NEXT: v_or_b32_e32 v8, v8, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v3 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v1 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v9 +; SI-NEXT: v_or_b32_e32 v5, v5, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v10 +; SI-NEXT: v_or_b32_e32 v3, v3, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v2, v2, v14 +; SI-NEXT: v_or_b32_e32 v1, v1, v14 ; SI-NEXT: v_or_b32_e32 v0, v0, v7 -; SI-NEXT: v_or_b32_e32 v10, v10, v13 +; SI-NEXT: v_or_b32_e32 v2, v2, v13 ; SI-NEXT: v_or_b32_e32 v4, v4, v12 -; SI-NEXT: v_alignbit_b32 v14, v2, v7, 16 -; SI-NEXT: v_alignbit_b32 v13, v11, v13, 16 -; SI-NEXT: v_alignbit_b32 v12, v8, v12, 16 +; SI-NEXT: v_alignbit_b32 v14, v1, v7, 16 +; SI-NEXT: v_alignbit_b32 v13, v3, v13, 16 +; SI-NEXT: v_alignbit_b32 v12, v5, v12, 16 ; SI-NEXT: .LBB22_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v7 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v13 ; SI-NEXT: v_or_b32_e32 v2, v2, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 ; SI-NEXT: v_or_b32_e32 v4, v4, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14f16_to_v14i16: @@ -4314,130 +4098,118 @@ define inreg <14 x i16> @bitcast_v14f16_to_v14i16_scalar(<14 x half> inreg %a, i ; SI-LABEL: bitcast_v14f16_to_v14i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v0 +; SI-NEXT: s_lshr_b32 s10, s22, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s9, s20, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s11, s18, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s12, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s23, 0 -; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: s_cbranch_scc0 .LBB23_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: s_cbranch_execnz .LBB23_4 ; SI-NEXT: .LBB23_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_or_b32_e32 v17, v2, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v12 -; SI-NEXT: v_or_b32_e32 v18, v7, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v13 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v16, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_or_b32_e32 v13, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s9 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v8 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 +; SI-NEXT: v_or_b32_e32 v14, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_or_b32_e32 v16, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s17 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_or_b32_e32 v5, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v17 ; SI-NEXT: v_or_b32_e32 v3, v3, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v15 -; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v18 ; SI-NEXT: v_or_b32_e32 v1, v1, v6 ; SI-NEXT: v_lshr_b64 v[10:11], v[0:1], 16 ; SI-NEXT: v_lshr_b64 v[8:9], v[2:3], 16 ; SI-NEXT: v_lshr_b64 v[6:7], v[4:5], 16 ; SI-NEXT: v_or_b32_e32 v4, v19, v4 -; SI-NEXT: .LBB23_3: ; %end -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: s_branch .LBB23_5 +; SI-NEXT: .LBB23_3: +; SI-NEXT: s_branch .LBB23_2 +; SI-NEXT: .LBB23_4: +; SI-NEXT: v_mov_b32_e32 v12, s10 +; SI-NEXT: v_mov_b32_e32 v15, s8 +; SI-NEXT: v_mov_b32_e32 v17, s7 +; SI-NEXT: v_mov_b32_e32 v18, s6 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v16, s22 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v14, s18 +; SI-NEXT: v_mov_b32_e32 v13, s16 +; SI-NEXT: v_mov_b32_e32 v10, s12 +; SI-NEXT: v_mov_b32_e32 v8, s11 +; SI-NEXT: v_mov_b32_e32 v6, s9 +; SI-NEXT: .LBB23_5: ; %end +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v18 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v2, v2, v7 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v17 ; SI-NEXT: v_or_b32_e32 v4, v4, v6 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v15 ; SI-NEXT: v_or_b32_e32 v3, v3, v7 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v16 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 ; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB23_4: -; SI-NEXT: s_branch .LBB23_2 ; ; VI-LABEL: bitcast_v14f16_to_v14i16_scalar: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll index 075216fc4791c..dd6846e7d0537 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll @@ -2149,128 +2149,71 @@ define <16 x half> @bitcast_v8i32_to_v16f16(<8 x i32> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: .LBB16_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v8, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v9, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v10, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v13, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_alignbit_b32 v8, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v9, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v10, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v13, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 ; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v20 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v14 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v13 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v10 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v15 +; SI-NEXT: v_or_b32_e32 v2, v2, v10 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v14 +; SI-NEXT: v_or_b32_e32 v4, v4, v9 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_or_b32_e32 v1, v1, v13 +; SI-NEXT: v_or_b32_e32 v3, v3, v10 +; SI-NEXT: v_or_b32_e32 v5, v5, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i32_to_v16f16: @@ -2358,115 +2301,75 @@ define inreg <16 x half> @bitcast_v8i32_to_v16f16_scalar(<8 x i32> inreg %a, i32 ; SI-NEXT: s_cmp_lg_u32 s24, 0 ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s16 +; SI-NEXT: s_lshr_b32 s14, s23, 16 +; SI-NEXT: s_lshr_b32 s15, s21, 16 +; SI-NEXT: s_lshr_b32 s24, s19, 16 +; SI-NEXT: s_lshr_b32 s25, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: s_lshr_b32 s5, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s20, 16 -; SI-NEXT: s_lshr_b32 s9, s21, 16 -; SI-NEXT: s_lshr_b32 s10, s22, 16 -; SI-NEXT: s_lshr_b32 s11, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s14, s23, 16 +; SI-NEXT: s_lshr_b32 s15, s21, 16 +; SI-NEXT: s_lshr_b32 s24, s19, 16 +; SI-NEXT: s_lshr_b32 s25, s17, 16 ; SI-NEXT: .LBB17_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 -; SI-NEXT: v_or_b32_e32 v0, v14, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 -; SI-NEXT: v_or_b32_e32 v2, v12, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v5, v5, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; SI-NEXT: v_or_b32_e32 v1, v16, v1 -; SI-NEXT: v_or_b32_e32 v3, v14, v3 -; SI-NEXT: v_or_b32_e32 v4, v11, v4 -; SI-NEXT: v_or_b32_e32 v6, v9, v6 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s10, 16 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s18, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s19, 0xffff +; SI-NEXT: s_lshl_b32 s10, s24, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s20, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s10, s6 +; SI-NEXT: s_and_b32 s10, s21, 0xffff +; SI-NEXT: s_lshl_b32 s11, s15, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s22, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s11, s4 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s14, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s6 +; SI-NEXT: v_mov_b32_e32 v5, s10 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: v_mov_b32_e32 v7, s11 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v8i32_to_v16f16_scalar: @@ -2575,46 +2478,22 @@ define <8 x i32> @bitcast_v16f16_to_v8i32(<16 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v16f16_to_v8i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v7 +; SI-NEXT: v_mov_b32_e32 v16, v7 +; SI-NEXT: v_mov_b32_e32 v9, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v11, v4 +; SI-NEXT: v_mov_b32_e32 v12, v3 +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_mov_b32_e32 v14, v1 +; SI-NEXT: v_mov_b32_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v15 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -2627,22 +2506,38 @@ define <8 x i32> @bitcast_v16f16_to_v8i32(<16 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB18_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 -; SI-NEXT: v_or_b32_e32 v0, v23, v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 -; SI-NEXT: v_or_b32_e32 v2, v19, v2 -; SI-NEXT: v_or_b32_e32 v3, v17, v3 -; SI-NEXT: v_or_b32_e32 v4, v15, v4 -; SI-NEXT: v_or_b32_e32 v5, v13, v5 -; SI-NEXT: v_or_b32_e32 v6, v11, v6 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v21 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v19 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v18 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v17 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 @@ -2651,21 +2546,13 @@ define <8 x i32> @bitcast_v16f16_to_v8i32(<16 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: .LBB18_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -2678,10 +2565,10 @@ define <8 x i32> @bitcast_v16f16_to_v8i32(<16 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -2690,18 +2577,18 @@ define <8 x i32> @bitcast_v16f16_to_v8i32(<16 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v17 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -2709,11 +2596,11 @@ define <8 x i32> @bitcast_v16f16_to_v8i32(<16 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v18 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v16 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -2830,71 +2717,47 @@ define inreg <8 x i32> @bitcast_v16f16_to_v8i32_scalar(<16 x half> inreg %a, i32 ; SI-LABEL: bitcast_v16f16_to_v8i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_lshr_b32 s14, s23, 16 +; SI-NEXT: s_lshr_b32 s15, s22, 16 +; SI-NEXT: s_lshr_b32 s25, s21, 16 +; SI-NEXT: s_lshr_b32 s26, s20, 16 +; SI-NEXT: s_lshr_b32 s27, s19, 16 +; SI-NEXT: s_lshr_b32 s28, s18, 16 +; SI-NEXT: s_lshr_b32 s29, s17, 16 +; SI-NEXT: s_lshr_b32 s40, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: s_cbranch_scc0 .LBB19_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_or_b32_e32 v0, v22, v0 -; SI-NEXT: v_or_b32_e32 v1, v20, v1 -; SI-NEXT: v_or_b32_e32 v2, v18, v2 -; SI-NEXT: v_or_b32_e32 v3, v16, v3 -; SI-NEXT: v_or_b32_e32 v4, v14, v4 -; SI-NEXT: v_or_b32_e32 v5, v12, v5 -; SI-NEXT: v_or_b32_e32 v6, v10, v6 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s40, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s29, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s28, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s27, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s25, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s15, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s14, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_cbranch_execnz .LBB19_4 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -2902,10 +2765,10 @@ define inreg <8 x i32> @bitcast_v16f16_to_v8i32_scalar(<16 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s28 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -2913,11 +2776,11 @@ define inreg <8 x i32> @bitcast_v16f16_to_v8i32_scalar(<16 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s27 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s26 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -2925,11 +2788,11 @@ define inreg <8 x i32> @bitcast_v16f16_to_v8i32_scalar(<16 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -2937,29 +2800,38 @@ define inreg <8 x i32> @bitcast_v16f16_to_v8i32_scalar(<16 x half> inreg %a, i32 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s15 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB19_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: .LBB19_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 ; SI-NEXT: s_branch .LBB19_2 +; SI-NEXT: .LBB19_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f16_to_v8i32_scalar: ; VI: ; %bb.0: @@ -9512,128 +9384,71 @@ define <16 x half> @bitcast_v8f32_to_v16f16(<8 x float> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB40_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: .LBB40_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v8, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v9, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v10, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v13, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB40_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_alignbit_b32 v8, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v9, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v10, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v13, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 ; SI-NEXT: .LBB40_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v20 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v14 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v13 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v10 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v15 +; SI-NEXT: v_or_b32_e32 v2, v2, v10 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v14 +; SI-NEXT: v_or_b32_e32 v4, v4, v9 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_or_b32_e32 v1, v1, v13 +; SI-NEXT: v_or_b32_e32 v3, v3, v10 +; SI-NEXT: v_or_b32_e32 v5, v5, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f32_to_v16f16: @@ -9714,118 +9529,88 @@ define inreg <16 x half> @bitcast_v8f32_to_v16f16_scalar(<8 x float> inreg %a, i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: s_cbranch_scc0 .LBB41_4 +; SI-NEXT: s_cbranch_scc0 .LBB41_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s16 -; SI-NEXT: s_cbranch_execnz .LBB41_3 +; SI-NEXT: s_lshr_b32 s25, s23, 16 +; SI-NEXT: s_lshr_b32 s24, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s19, 16 +; SI-NEXT: s_lshr_b32 s14, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[16:17], 16 +; SI-NEXT: s_cbranch_execnz .LBB41_4 ; SI-NEXT: .LBB41_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v0, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v3, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v2, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 ; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_lshr_b64 v[8:9], v[6:7], 16 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_lshr_b64 v[9:10], v[4:5], 16 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_lshr_b64 v[10:11], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[11:12], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: .LBB41_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 -; SI-NEXT: v_or_b32_e32 v0, v14, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 -; SI-NEXT: v_or_b32_e32 v2, v12, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v5, v5, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; SI-NEXT: v_or_b32_e32 v1, v16, v1 -; SI-NEXT: v_or_b32_e32 v3, v14, v3 -; SI-NEXT: v_or_b32_e32 v4, v11, v4 -; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: s_branch .LBB41_5 +; SI-NEXT: .LBB41_3: +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: s_branch .LBB41_2 +; SI-NEXT: .LBB41_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v15, s14 +; SI-NEXT: v_mov_b32_e32 v14, s15 +; SI-NEXT: v_mov_b32_e32 v13, s24 +; SI-NEXT: v_mov_b32_e32 v12, s25 +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: v_mov_b32_e32 v9, s6 +; SI-NEXT: v_mov_b32_e32 v10, s8 +; SI-NEXT: v_mov_b32_e32 v11, s10 +; SI-NEXT: .LBB41_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_or_b32_e32 v2, v2, v10 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v14 +; SI-NEXT: v_or_b32_e32 v4, v4, v9 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v13 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_or_b32_e32 v3, v3, v10 +; SI-NEXT: v_or_b32_e32 v5, v5, v9 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB41_4: -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: s_branch .LBB41_2 ; ; VI-LABEL: bitcast_v8f32_to_v16f16_scalar: ; VI: ; %bb.0: @@ -9939,46 +9724,22 @@ define <8 x float> @bitcast_v16f16_to_v8f32(<16 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v16f16_to_v8f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v7 +; SI-NEXT: v_mov_b32_e32 v16, v7 +; SI-NEXT: v_mov_b32_e32 v9, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v11, v4 +; SI-NEXT: v_mov_b32_e32 v12, v3 +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_mov_b32_e32 v14, v1 +; SI-NEXT: v_mov_b32_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v15 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -9991,22 +9752,38 @@ define <8 x float> @bitcast_v16f16_to_v8f32(<16 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB42_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 -; SI-NEXT: v_or_b32_e32 v0, v23, v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 -; SI-NEXT: v_or_b32_e32 v2, v19, v2 -; SI-NEXT: v_or_b32_e32 v3, v17, v3 -; SI-NEXT: v_or_b32_e32 v4, v15, v4 -; SI-NEXT: v_or_b32_e32 v5, v13, v5 -; SI-NEXT: v_or_b32_e32 v6, v11, v6 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v21 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v19 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v18 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v17 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 @@ -10015,21 +9792,13 @@ define <8 x float> @bitcast_v16f16_to_v8f32(<16 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_2 ; SI-NEXT: .LBB42_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -10042,10 +9811,10 @@ define <8 x float> @bitcast_v16f16_to_v8f32(<16 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -10054,18 +9823,18 @@ define <8 x float> @bitcast_v16f16_to_v8f32(<16 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v17 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -10073,11 +9842,11 @@ define <8 x float> @bitcast_v16f16_to_v8f32(<16 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v18 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v16 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -10194,71 +9963,47 @@ define inreg <8 x float> @bitcast_v16f16_to_v8f32_scalar(<16 x half> inreg %a, i ; SI-LABEL: bitcast_v16f16_to_v8f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_lshr_b32 s14, s23, 16 +; SI-NEXT: s_lshr_b32 s15, s22, 16 +; SI-NEXT: s_lshr_b32 s25, s21, 16 +; SI-NEXT: s_lshr_b32 s26, s20, 16 +; SI-NEXT: s_lshr_b32 s27, s19, 16 +; SI-NEXT: s_lshr_b32 s28, s18, 16 +; SI-NEXT: s_lshr_b32 s29, s17, 16 +; SI-NEXT: s_lshr_b32 s40, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: s_cbranch_scc0 .LBB43_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_or_b32_e32 v0, v22, v0 -; SI-NEXT: v_or_b32_e32 v1, v20, v1 -; SI-NEXT: v_or_b32_e32 v2, v18, v2 -; SI-NEXT: v_or_b32_e32 v3, v16, v3 -; SI-NEXT: v_or_b32_e32 v4, v14, v4 -; SI-NEXT: v_or_b32_e32 v5, v12, v5 -; SI-NEXT: v_or_b32_e32 v6, v10, v6 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s40, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s29, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s28, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s27, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s25, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s15, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s14, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_cbranch_execnz .LBB43_4 ; SI-NEXT: .LBB43_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -10266,10 +10011,10 @@ define inreg <8 x float> @bitcast_v16f16_to_v8f32_scalar(<16 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s28 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -10277,11 +10022,11 @@ define inreg <8 x float> @bitcast_v16f16_to_v8f32_scalar(<16 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s27 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s26 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -10289,11 +10034,11 @@ define inreg <8 x float> @bitcast_v16f16_to_v8f32_scalar(<16 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -10301,29 +10046,38 @@ define inreg <8 x float> @bitcast_v16f16_to_v8f32_scalar(<16 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s15 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB43_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: .LBB43_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 ; SI-NEXT: s_branch .LBB43_2 +; SI-NEXT: .LBB43_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f16_to_v8f32_scalar: ; VI: ; %bb.0: @@ -16462,59 +16216,26 @@ define <16 x half> @bitcast_v4i64_to_v16f16(<4 x i64> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB60_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: .LBB60_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v8, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v9, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v10, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v13, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB60_4 ; SI-NEXT: ; %bb.3: ; %cmp.true @@ -16526,64 +16247,40 @@ define <16 x half> @bitcast_v4i64_to_v16f16(<4 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_alignbit_b32 v8, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v9, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v10, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v13, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 ; SI-NEXT: .LBB60_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v20 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v14 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v13 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v10 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v15 +; SI-NEXT: v_or_b32_e32 v2, v2, v10 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v14 +; SI-NEXT: v_or_b32_e32 v4, v4, v9 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_or_b32_e32 v1, v1, v13 +; SI-NEXT: v_or_b32_e32 v3, v3, v10 +; SI-NEXT: v_or_b32_e32 v5, v5, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i64_to_v16f16: @@ -16673,115 +16370,75 @@ define inreg <16 x half> @bitcast_v4i64_to_v16f16_scalar(<4 x i64> inreg %a, i32 ; SI-NEXT: s_cmp_lg_u32 s24, 0 ; SI-NEXT: s_cbranch_scc0 .LBB61_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s16 +; SI-NEXT: s_lshr_b32 s14, s23, 16 +; SI-NEXT: s_lshr_b32 s15, s21, 16 +; SI-NEXT: s_lshr_b32 s24, s19, 16 +; SI-NEXT: s_lshr_b32 s25, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB61_3 ; SI-NEXT: .LBB61_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s16, 3 -; SI-NEXT: s_addc_u32 s5, s17, 0 -; SI-NEXT: s_lshr_b32 s6, s4, 16 -; SI-NEXT: s_lshr_b32 s7, s5, 16 -; SI-NEXT: s_add_u32 s8, s18, 3 -; SI-NEXT: s_addc_u32 s9, s19, 0 -; SI-NEXT: s_lshr_b32 s10, s8, 16 -; SI-NEXT: s_lshr_b32 s11, s9, 16 -; SI-NEXT: s_add_u32 s12, s20, 3 -; SI-NEXT: s_addc_u32 s13, s21, 0 -; SI-NEXT: s_lshr_b32 s14, s12, 16 -; SI-NEXT: s_lshr_b32 s15, s13, 16 -; SI-NEXT: s_add_u32 s16, s22, 3 -; SI-NEXT: s_addc_u32 s17, s23, 0 -; SI-NEXT: s_lshr_b32 s18, s16, 16 -; SI-NEXT: s_lshr_b32 s19, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s6 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s14, s23, 16 +; SI-NEXT: s_lshr_b32 s15, s21, 16 +; SI-NEXT: s_lshr_b32 s24, s19, 16 +; SI-NEXT: s_lshr_b32 s25, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[16:17], 16 ; SI-NEXT: .LBB61_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 -; SI-NEXT: v_or_b32_e32 v0, v14, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 -; SI-NEXT: v_or_b32_e32 v2, v12, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v5, v5, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; SI-NEXT: v_or_b32_e32 v1, v16, v1 -; SI-NEXT: v_or_b32_e32 v3, v14, v3 -; SI-NEXT: v_or_b32_e32 v4, v11, v4 -; SI-NEXT: v_or_b32_e32 v6, v9, v6 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s10, 16 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s18, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s19, 0xffff +; SI-NEXT: s_lshl_b32 s10, s24, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s20, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s10, s6 +; SI-NEXT: s_and_b32 s10, s21, 0xffff +; SI-NEXT: s_lshl_b32 s11, s15, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s22, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s11, s4 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s14, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s6 +; SI-NEXT: v_mov_b32_e32 v5, s10 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: v_mov_b32_e32 v7, s11 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB61_4: -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: s_branch .LBB61_2 ; ; VI-LABEL: bitcast_v4i64_to_v16f16_scalar: @@ -16890,46 +16547,22 @@ define <4 x i64> @bitcast_v16f16_to_v4i64(<16 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v16f16_to_v4i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v7 +; SI-NEXT: v_mov_b32_e32 v16, v7 +; SI-NEXT: v_mov_b32_e32 v9, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v11, v4 +; SI-NEXT: v_mov_b32_e32 v12, v3 +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_mov_b32_e32 v14, v1 +; SI-NEXT: v_mov_b32_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v15 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -16942,22 +16575,38 @@ define <4 x i64> @bitcast_v16f16_to_v4i64(<16 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB62_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 -; SI-NEXT: v_or_b32_e32 v0, v23, v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 -; SI-NEXT: v_or_b32_e32 v2, v19, v2 -; SI-NEXT: v_or_b32_e32 v3, v17, v3 -; SI-NEXT: v_or_b32_e32 v4, v15, v4 -; SI-NEXT: v_or_b32_e32 v5, v13, v5 -; SI-NEXT: v_or_b32_e32 v6, v11, v6 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v21 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v19 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v18 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v17 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 @@ -16966,21 +16615,13 @@ define <4 x i64> @bitcast_v16f16_to_v4i64(<16 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB62_2 ; SI-NEXT: .LBB62_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -16993,10 +16634,10 @@ define <4 x i64> @bitcast_v16f16_to_v4i64(<16 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -17005,18 +16646,18 @@ define <4 x i64> @bitcast_v16f16_to_v4i64(<16 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v17 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -17024,11 +16665,11 @@ define <4 x i64> @bitcast_v16f16_to_v4i64(<16 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v18 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v16 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -17145,71 +16786,47 @@ define inreg <4 x i64> @bitcast_v16f16_to_v4i64_scalar(<16 x half> inreg %a, i32 ; SI-LABEL: bitcast_v16f16_to_v4i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_lshr_b32 s14, s23, 16 +; SI-NEXT: s_lshr_b32 s15, s22, 16 +; SI-NEXT: s_lshr_b32 s25, s21, 16 +; SI-NEXT: s_lshr_b32 s26, s20, 16 +; SI-NEXT: s_lshr_b32 s27, s19, 16 +; SI-NEXT: s_lshr_b32 s28, s18, 16 +; SI-NEXT: s_lshr_b32 s29, s17, 16 +; SI-NEXT: s_lshr_b32 s40, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: s_cbranch_scc0 .LBB63_4 +; SI-NEXT: s_cbranch_scc0 .LBB63_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_or_b32_e32 v0, v22, v0 -; SI-NEXT: v_or_b32_e32 v1, v20, v1 -; SI-NEXT: v_or_b32_e32 v2, v18, v2 -; SI-NEXT: v_or_b32_e32 v3, v16, v3 -; SI-NEXT: v_or_b32_e32 v4, v14, v4 -; SI-NEXT: v_or_b32_e32 v5, v12, v5 -; SI-NEXT: v_or_b32_e32 v6, v10, v6 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: s_cbranch_execnz .LBB63_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s40, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s29, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s28, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s27, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s25, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s15, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s14, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_cbranch_execnz .LBB63_4 ; SI-NEXT: .LBB63_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -17217,10 +16834,10 @@ define inreg <4 x i64> @bitcast_v16f16_to_v4i64_scalar(<16 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s28 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -17228,11 +16845,11 @@ define inreg <4 x i64> @bitcast_v16f16_to_v4i64_scalar(<16 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s27 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s26 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -17240,11 +16857,11 @@ define inreg <4 x i64> @bitcast_v16f16_to_v4i64_scalar(<16 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -17252,29 +16869,38 @@ define inreg <4 x i64> @bitcast_v16f16_to_v4i64_scalar(<16 x half> inreg %a, i32 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s15 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: .LBB63_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB63_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: .LBB63_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 ; SI-NEXT: s_branch .LBB63_2 +; SI-NEXT: .LBB63_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f16_to_v4i64_scalar: ; VI: ; %bb.0: @@ -22903,120 +22529,67 @@ define <16 x half> @bitcast_v4f64_to_v16f16(<4 x double> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB76_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: .LBB76_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v8, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v9, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v10, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v13, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB76_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_alignbit_b32 v8, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v9, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v10, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v13, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 ; SI-NEXT: .LBB76_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v20 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v14 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v13 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v10 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v15 +; SI-NEXT: v_or_b32_e32 v2, v2, v10 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v14 +; SI-NEXT: v_or_b32_e32 v4, v4, v9 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_or_b32_e32 v1, v1, v13 +; SI-NEXT: v_or_b32_e32 v3, v3, v10 +; SI-NEXT: v_or_b32_e32 v5, v5, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f64_to_v16f16: @@ -23092,114 +22665,84 @@ define inreg <16 x half> @bitcast_v4f64_to_v16f16_scalar(<4 x double> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: s_cbranch_scc0 .LBB77_4 +; SI-NEXT: s_cbranch_scc0 .LBB77_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s16 -; SI-NEXT: s_cbranch_execnz .LBB77_3 +; SI-NEXT: s_lshr_b32 s25, s23, 16 +; SI-NEXT: s_lshr_b32 s24, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s19, 16 +; SI-NEXT: s_lshr_b32 s14, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[16:17], 16 +; SI-NEXT: s_cbranch_execnz .LBB77_4 ; SI-NEXT: .LBB77_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[3:4], s[16:17], 1.0 -; SI-NEXT: v_add_f64 v[0:1], s[18:19], 1.0 -; SI-NEXT: v_add_f64 v[10:11], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[5:6], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_lshr_b64 v[8:9], v[6:7], 16 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_lshr_b64 v[9:10], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[10:11], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[11:12], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: .LBB77_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 -; SI-NEXT: v_or_b32_e32 v0, v14, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 -; SI-NEXT: v_or_b32_e32 v2, v12, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v5, v5, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; SI-NEXT: v_or_b32_e32 v1, v16, v1 -; SI-NEXT: v_or_b32_e32 v3, v14, v3 -; SI-NEXT: v_or_b32_e32 v4, v11, v4 -; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v1 +; SI-NEXT: s_branch .LBB77_5 +; SI-NEXT: .LBB77_3: +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: s_branch .LBB77_2 +; SI-NEXT: .LBB77_4: +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v13, s25 +; SI-NEXT: v_mov_b32_e32 v14, s24 +; SI-NEXT: v_mov_b32_e32 v15, s15 +; SI-NEXT: v_mov_b32_e32 v12, s14 +; SI-NEXT: v_mov_b32_e32 v11, s10 +; SI-NEXT: v_mov_b32_e32 v10, s8 +; SI-NEXT: v_mov_b32_e32 v9, s6 +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: .LBB77_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v2, v2, v10 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v15 +; SI-NEXT: v_or_b32_e32 v4, v4, v9 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v14 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v13 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_or_b32_e32 v3, v3, v10 +; SI-NEXT: v_or_b32_e32 v5, v5, v9 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB77_4: -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: s_branch .LBB77_2 ; ; VI-LABEL: bitcast_v4f64_to_v16f16_scalar: ; VI: ; %bb.0: @@ -23301,46 +22844,22 @@ define <4 x double> @bitcast_v16f16_to_v4f64(<16 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v16f16_to_v4f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v7 +; SI-NEXT: v_mov_b32_e32 v16, v7 +; SI-NEXT: v_mov_b32_e32 v9, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v11, v4 +; SI-NEXT: v_mov_b32_e32 v12, v3 +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_mov_b32_e32 v14, v1 +; SI-NEXT: v_mov_b32_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v15 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -23353,22 +22872,38 @@ define <4 x double> @bitcast_v16f16_to_v4f64(<16 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB78_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 -; SI-NEXT: v_or_b32_e32 v0, v23, v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 -; SI-NEXT: v_or_b32_e32 v2, v19, v2 -; SI-NEXT: v_or_b32_e32 v3, v17, v3 -; SI-NEXT: v_or_b32_e32 v4, v15, v4 -; SI-NEXT: v_or_b32_e32 v5, v13, v5 -; SI-NEXT: v_or_b32_e32 v6, v11, v6 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v21 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v19 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v18 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v17 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 @@ -23377,21 +22912,13 @@ define <4 x double> @bitcast_v16f16_to_v4f64(<16 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB78_2 ; SI-NEXT: .LBB78_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -23404,10 +22931,10 @@ define <4 x double> @bitcast_v16f16_to_v4f64(<16 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -23416,18 +22943,18 @@ define <4 x double> @bitcast_v16f16_to_v4f64(<16 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v17 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -23435,11 +22962,11 @@ define <4 x double> @bitcast_v16f16_to_v4f64(<16 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v18 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v16 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -23556,71 +23083,47 @@ define inreg <4 x double> @bitcast_v16f16_to_v4f64_scalar(<16 x half> inreg %a, ; SI-LABEL: bitcast_v16f16_to_v4f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_lshr_b32 s14, s23, 16 +; SI-NEXT: s_lshr_b32 s15, s22, 16 +; SI-NEXT: s_lshr_b32 s25, s21, 16 +; SI-NEXT: s_lshr_b32 s26, s20, 16 +; SI-NEXT: s_lshr_b32 s27, s19, 16 +; SI-NEXT: s_lshr_b32 s28, s18, 16 +; SI-NEXT: s_lshr_b32 s29, s17, 16 +; SI-NEXT: s_lshr_b32 s40, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: s_cbranch_scc0 .LBB79_4 +; SI-NEXT: s_cbranch_scc0 .LBB79_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_or_b32_e32 v0, v22, v0 -; SI-NEXT: v_or_b32_e32 v1, v20, v1 -; SI-NEXT: v_or_b32_e32 v2, v18, v2 -; SI-NEXT: v_or_b32_e32 v3, v16, v3 -; SI-NEXT: v_or_b32_e32 v4, v14, v4 -; SI-NEXT: v_or_b32_e32 v5, v12, v5 -; SI-NEXT: v_or_b32_e32 v6, v10, v6 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: s_cbranch_execnz .LBB79_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s40, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s29, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s28, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s27, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s25, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s15, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s14, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_cbranch_execnz .LBB79_4 ; SI-NEXT: .LBB79_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -23628,10 +23131,10 @@ define inreg <4 x double> @bitcast_v16f16_to_v4f64_scalar(<16 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s28 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -23639,11 +23142,11 @@ define inreg <4 x double> @bitcast_v16f16_to_v4f64_scalar(<16 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s27 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s26 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -23651,11 +23154,11 @@ define inreg <4 x double> @bitcast_v16f16_to_v4f64_scalar(<16 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -23663,29 +23166,38 @@ define inreg <4 x double> @bitcast_v16f16_to_v4f64_scalar(<16 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s15 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: .LBB79_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB79_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: .LBB79_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 ; SI-NEXT: s_branch .LBB79_2 +; SI-NEXT: .LBB79_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f16_to_v4f64_scalar: ; VI: ; %bb.0: @@ -28561,51 +28073,59 @@ define <16 x half> @bitcast_v16i16_to_v16f16(<16 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v16i16_to_v16f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v12 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB88_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v8, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v17, v1, v25 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v15, v1, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v22, v0, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v13, v1, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v18, v0, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v8, v1, v31 +; SI-NEXT: v_or_b32_e32 v16, v0, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_alignbit_b32 v19, v17, v24, 16 +; SI-NEXT: v_alignbit_b32 v20, v15, v26, 16 +; SI-NEXT: v_alignbit_b32 v21, v13, v28, 16 +; SI-NEXT: v_alignbit_b32 v23, v8, v30, 16 +; SI-NEXT: v_or_b32_e32 v12, v0, v30 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -28614,84 +28134,85 @@ define <16 x half> @bitcast_v16i16_to_v16f16(<16 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: .LBB88_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB88_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v30 ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v29 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v28 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v26 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v24 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v6, v30, v6 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v4, v28, v4 +; SI-NEXT: v_or_b32_e32 v2, v26, v2 +; SI-NEXT: v_or_b32_e32 v0, v24, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v6, v31, v6 +; SI-NEXT: v_or_b32_e32 v4, v29, v4 +; SI-NEXT: v_or_b32_e32 v2, v27, v2 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v0 +; SI-NEXT: v_alignbit_b32 v19, v17, v22, 16 +; SI-NEXT: v_alignbit_b32 v20, v15, v18, 16 +; SI-NEXT: v_alignbit_b32 v21, v13, v16, 16 +; SI-NEXT: v_alignbit_b32 v23, v8, v12, 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 ; SI-NEXT: .LBB88_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v10 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v17 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v19 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v14 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v21 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v16 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v20 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v11 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v10 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v23 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i16_to_v16f16: @@ -28794,118 +28315,142 @@ define inreg <16 x half> @bitcast_v16i16_to_v16f16_scalar(<16 x i16> inreg %a, i ; SI-LABEL: bitcast_v16i16_to_v16f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s13, s23, 16 -; SI-NEXT: s_lshr_b32 s12, s22, 16 -; SI-NEXT: s_lshr_b32 s11, s21, 16 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_lshr_b32 s43, s23, 16 +; SI-NEXT: s_lshr_b32 s47, s22, 16 +; SI-NEXT: s_lshr_b32 s42, s21, 16 +; SI-NEXT: s_lshr_b32 s46, s20, 16 +; SI-NEXT: s_lshr_b32 s41, s19, 16 +; SI-NEXT: s_lshr_b32 s45, s18, 16 +; SI-NEXT: s_lshr_b32 s40, s17, 16 +; SI-NEXT: s_lshr_b32 s44, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s24, 0 ; SI-NEXT: s_cbranch_scc0 .LBB89_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s40, 16 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s56, s44, 16 +; SI-NEXT: s_or_b32 s57, s5, s7 +; SI-NEXT: s_and_b32 s5, s19, 0xffff +; SI-NEXT: s_lshl_b32 s7, s41, 16 +; SI-NEXT: s_or_b32 s10, s4, s56 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s58, s45, 16 +; SI-NEXT: s_or_b32 s59, s5, s7 +; SI-NEXT: s_and_b32 s5, s21, 0xffff +; SI-NEXT: s_lshl_b32 s7, s42, 16 +; SI-NEXT: s_or_b32 s8, s4, s58 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s60, s46, 16 +; SI-NEXT: s_or_b32 s61, s5, s7 +; SI-NEXT: s_and_b32 s5, s23, 0xffff +; SI-NEXT: s_lshl_b32 s7, s43, 16 +; SI-NEXT: s_or_b32 s6, s4, s60 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s62, s47, 16 +; SI-NEXT: s_or_b32 s63, s5, s7 +; SI-NEXT: s_or_b32 s4, s4, s62 +; SI-NEXT: s_lshr_b64 s[12:13], s[56:57], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[58:59], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[60:61], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[62:63], 16 +; SI-NEXT: s_mov_b32 s11, s57 +; SI-NEXT: s_mov_b32 s9, s59 +; SI-NEXT: s_mov_b32 s7, s61 +; SI-NEXT: s_mov_b32 s5, s63 ; SI-NEXT: s_cbranch_execnz .LBB89_3 ; SI-NEXT: .LBB89_2: ; %cmp.true -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s47, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s23, 0xffff +; SI-NEXT: s_lshl_b32 s6, s43, 16 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s46, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s21, 0xffff +; SI-NEXT: s_lshl_b32 s8, s42, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s18, 0xffff +; SI-NEXT: s_lshl_b32 s9, s45, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s19, 0xffff +; SI-NEXT: s_lshl_b32 s10, s41, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s16, 0xffff +; SI-NEXT: s_lshl_b32 s11, s44, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s17, 0xffff +; SI-NEXT: s_lshl_b32 s12, s40, 16 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_lshr_b64 s[12:13], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b32 s40, s11, 16 +; SI-NEXT: s_lshr_b32 s41, s9, 16 +; SI-NEXT: s_lshr_b32 s42, s7, 16 +; SI-NEXT: s_lshr_b32 s43, s5, 16 ; SI-NEXT: .LBB89_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v0, v0, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; SI-NEXT: v_or_b32_e32 v1, v1, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v2, v2, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_or_b32_e32 v3, v3, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v4, v4, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_or_b32_e32 v5, v5, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v6, v6, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s12, s40, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s12, s14, 16 +; SI-NEXT: s_or_b32 s8, s8, s12 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s12, s41, 16 +; SI-NEXT: s_or_b32 s9, s9, s12 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s12, s24, 16 +; SI-NEXT: s_or_b32 s6, s6, s12 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s12, s42, 16 +; SI-NEXT: s_or_b32 s7, s7, s12 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s12, s26, 16 +; SI-NEXT: s_or_b32 s4, s4, s12 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s12, s43, 16 +; SI-NEXT: s_or_b32 s5, s5, s12 +; SI-NEXT: v_mov_b32_e32 v0, s10 +; SI-NEXT: v_mov_b32_e32 v1, s11 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s6 +; SI-NEXT: v_mov_b32_e32 v5, s7 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: v_mov_b32_e32 v7, s5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB89_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: s_branch .LBB89_2 ; ; VI-LABEL: bitcast_v16i16_to_v16f16_scalar: @@ -29051,146 +28596,114 @@ define <16 x i16> @bitcast_v16f16_to_v16i16(<16 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v16f16_to_v16i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v11 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB90_2 ; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v10, v10, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 -; SI-NEXT: v_or_b32_e32 v6, v6, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v3 -; SI-NEXT: v_or_b32_e32 v12, v12, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v7, v7, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v10 +; SI-NEXT: v_or_b32_e32 v5, v5, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_or_b32_e32 v3, v3, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v12 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v2, v2, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v16 ; SI-NEXT: v_or_b32_e32 v0, v0, v8 -; SI-NEXT: v_or_b32_e32 v11, v11, v15 +; SI-NEXT: v_or_b32_e32 v2, v2, v15 ; SI-NEXT: v_or_b32_e32 v4, v4, v14 -; SI-NEXT: v_or_b32_e32 v9, v9, v13 -; SI-NEXT: v_alignbit_b32 v16, v2, v8, 16 -; SI-NEXT: v_alignbit_b32 v15, v12, v15, 16 -; SI-NEXT: v_alignbit_b32 v14, v6, v14, 16 -; SI-NEXT: v_alignbit_b32 v13, v10, v13, 16 +; SI-NEXT: v_or_b32_e32 v6, v6, v13 +; SI-NEXT: v_alignbit_b32 v16, v1, v8, 16 +; SI-NEXT: v_alignbit_b32 v15, v3, v15, 16 +; SI-NEXT: v_alignbit_b32 v14, v5, v14, 16 +; SI-NEXT: v_alignbit_b32 v13, v7, v13, 16 ; SI-NEXT: .LBB90_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v8 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; SI-NEXT: v_or_b32_e32 v1, v1, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v15 ; SI-NEXT: v_or_b32_e32 v2, v2, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v8, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_or_b32_e32 v3, v3, v8 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v14 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v4, v4, v8 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; SI-NEXT: v_or_b32_e32 v5, v5, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v13 ; SI-NEXT: v_or_b32_e32 v6, v6, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f16_to_v16i16: @@ -29294,147 +28807,133 @@ define inreg <16 x i16> @bitcast_v16f16_to_v16i16_scalar(<16 x half> inreg %a, i ; SI-LABEL: bitcast_v16f16_to_v16i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v9 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: s_lshr_b32 s9, s22, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s11, s20, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s12, s18, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s13, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: s_cbranch_scc0 .LBB91_4 +; SI-NEXT: s_cbranch_scc0 .LBB91_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB91_3 +; SI-NEXT: s_cbranch_execnz .LBB91_4 ; SI-NEXT: .LBB91_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v20 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_or_b32_e32 v17, v4, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v16, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s11 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v23, v6, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v8 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v20, v9, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v9 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v22, v8, v6 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_or_b32_e32 v19, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v9 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_or_b32_e32 v18, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s23 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v20, v1, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v17 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v7 +; SI-NEXT: v_or_b32_e32 v7, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s17 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_or_b32_e32 v5, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; SI-NEXT: v_lshr_b64 v[10:11], v[4:5], 16 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v22 ; SI-NEXT: v_or_b32_e32 v3, v3, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v21 -; SI-NEXT: v_or_b32_e32 v5, v5, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v23 ; SI-NEXT: v_or_b32_e32 v1, v1, v8 ; SI-NEXT: v_lshr_b64 v[14:15], v[0:1], 16 ; SI-NEXT: v_lshr_b64 v[12:13], v[2:3], 16 -; SI-NEXT: v_lshr_b64 v[10:11], v[4:5], 16 ; SI-NEXT: v_lshr_b64 v[8:9], v[6:7], 16 -; SI-NEXT: .LBB91_3: ; %end +; SI-NEXT: s_branch .LBB91_5 +; SI-NEXT: .LBB91_3: +; SI-NEXT: s_branch .LBB91_2 +; SI-NEXT: .LBB91_4: +; SI-NEXT: v_mov_b32_e32 v17, s10 +; SI-NEXT: v_mov_b32_e32 v21, s8 +; SI-NEXT: v_mov_b32_e32 v22, s7 +; SI-NEXT: v_mov_b32_e32 v23, s6 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v20, s22 +; SI-NEXT: v_mov_b32_e32 v18, s20 +; SI-NEXT: v_mov_b32_e32 v19, s18 +; SI-NEXT: v_mov_b32_e32 v16, s16 +; SI-NEXT: v_mov_b32_e32 v14, s13 +; SI-NEXT: v_mov_b32_e32 v12, s12 +; SI-NEXT: v_mov_b32_e32 v10, s11 +; SI-NEXT: v_mov_b32_e32 v8, s9 +; SI-NEXT: .LBB91_5: ; %end ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v16 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v19 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v12 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v22 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v18 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v10 ; SI-NEXT: v_or_b32_e32 v4, v4, v6 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v21 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v20 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v6, v6, v8 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v17 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB91_4: -; SI-NEXT: s_branch .LBB91_2 ; ; VI-LABEL: bitcast_v16f16_to_v16i16_scalar: ; VI: ; %bb.0: @@ -34904,91 +34403,59 @@ define <16 x bfloat> @bitcast_v16f16_to_v16bf16(<16 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v16f16_to_v16bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v9 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB100_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v31 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB100_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v31 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr26 @@ -35001,103 +34468,103 @@ define <16 x bfloat> @bitcast_v16f16_to_v16bf16(<16 x half> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB100_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v24 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v25 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v21 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v17 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v24 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v8 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v25 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: .LBB100_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v13 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v9 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v10 ; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v17 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v12 ; SI-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v19 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v14 ; SI-NEXT: v_alignbit_b32 v4, v4, v5, 16 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v21 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v16 ; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v22 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v18 ; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v23 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v20 ; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -35202,183 +34669,169 @@ define inreg <16 x bfloat> @bitcast_v16f16_to_v16bf16_scalar(<16 x half> inreg % ; SI-LABEL: bitcast_v16f16_to_v16bf16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v0 +; SI-NEXT: s_lshr_b32 s14, s23, 16 +; SI-NEXT: s_lshr_b32 s12, s22, 16 +; SI-NEXT: s_lshr_b32 s11, s21, 16 +; SI-NEXT: s_lshr_b32 s10, s20, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s18, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: s_cbranch_scc0 .LBB101_4 +; SI-NEXT: s_cbranch_scc0 .LBB101_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v31 -; SI-NEXT: s_cbranch_execnz .LBB101_3 +; SI-NEXT: s_lshl_b32 s13, s16, 16 +; SI-NEXT: s_lshl_b32 s15, s6, 16 +; SI-NEXT: s_lshl_b32 s24, s17, 16 +; SI-NEXT: s_lshl_b32 s25, s7, 16 +; SI-NEXT: s_lshl_b32 s26, s18, 16 +; SI-NEXT: s_lshl_b32 s27, s8, 16 +; SI-NEXT: s_lshl_b32 s28, s19, 16 +; SI-NEXT: s_lshl_b32 s29, s9, 16 +; SI-NEXT: s_lshl_b32 s40, s20, 16 +; SI-NEXT: s_lshl_b32 s41, s10, 16 +; SI-NEXT: s_lshl_b32 s42, s21, 16 +; SI-NEXT: s_lshl_b32 s43, s11, 16 +; SI-NEXT: s_lshl_b32 s44, s22, 16 +; SI-NEXT: s_lshl_b32 s45, s12, 16 +; SI-NEXT: s_lshl_b32 s46, s23, 16 +; SI-NEXT: s_lshl_b32 s47, s14, 16 +; SI-NEXT: s_cbranch_execnz .LBB101_4 ; SI-NEXT: .LBB101_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v28 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v24 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s16 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s22 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s11 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s21 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s10 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s20 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s9 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s19 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v11 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: .LBB101_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_branch .LBB101_5 +; SI-NEXT: .LBB101_3: +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: s_branch .LBB101_2 +; SI-NEXT: .LBB101_4: +; SI-NEXT: v_mov_b32_e32 v9, s47 +; SI-NEXT: v_mov_b32_e32 v8, s46 +; SI-NEXT: v_mov_b32_e32 v10, s45 +; SI-NEXT: v_mov_b32_e32 v7, s44 +; SI-NEXT: v_mov_b32_e32 v11, s43 +; SI-NEXT: v_mov_b32_e32 v6, s42 +; SI-NEXT: v_mov_b32_e32 v12, s41 +; SI-NEXT: v_mov_b32_e32 v5, s40 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: v_mov_b32_e32 v4, s28 +; SI-NEXT: v_mov_b32_e32 v14, s27 +; SI-NEXT: v_mov_b32_e32 v3, s26 +; SI-NEXT: v_mov_b32_e32 v15, s25 +; SI-NEXT: v_mov_b32_e32 v2, s24 +; SI-NEXT: v_mov_b32_e32 v1, s15 +; SI-NEXT: v_mov_b32_e32 v0, s13 +; SI-NEXT: .LBB101_5: ; %end ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v5 -; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v6 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_lshr_b64 v[3:4], v[4:5], 16 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v9 -; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v13 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v7 -; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v19 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v15 -; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v17 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v11 -; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v2 +; SI-NEXT: v_lshr_b64 v[1:2], v[15:16], 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v2 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v3 +; SI-NEXT: v_lshr_b64 v[2:3], v[14:15], 16 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v13 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v4 +; SI-NEXT: v_lshr_b64 v[3:4], v[13:14], 16 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v5 +; SI-NEXT: v_lshr_b64 v[4:5], v[12:13], 16 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v11 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v6 +; SI-NEXT: v_lshr_b64 v[5:6], v[11:12], 16 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v10 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v7 +; SI-NEXT: v_lshr_b64 v[6:7], v[10:11], 16 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_lshr_b64 v[7:8], v[8:9], 16 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB101_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: s_branch .LBB101_2 ; ; VI-LABEL: bitcast_v16f16_to_v16bf16_scalar: ; VI: ; %bb.0: @@ -35540,195 +34993,171 @@ define <16 x half> @bitcast_v16bf16_to_v16f16(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v0 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 ; SI-NEXT: v_mul_f32_e32 v28, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v9 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB102_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v16 +; SI-NEXT: v_alignbit_b32 v11, v1, v17, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v18 +; SI-NEXT: v_alignbit_b32 v12, v11, v0, 16 +; SI-NEXT: v_alignbit_b32 v10, v3, v19, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v20 +; SI-NEXT: v_alignbit_b32 v13, v10, v0, 16 +; SI-NEXT: v_alignbit_b32 v6, v5, v21, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v24 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v22 +; SI-NEXT: v_alignbit_b32 v14, v6, v0, 16 +; SI-NEXT: v_alignbit_b32 v8, v7, v25, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 +; SI-NEXT: v_alignbit_b32 v15, v8, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v23 +; SI-NEXT: v_alignbit_b32 v0, v0, v31, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v30, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v29, 16 +; SI-NEXT: v_alignbit_b32 v9, v9, v28, 16 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: .LBB102_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB102_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v27 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v30 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v28 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v22 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v26 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v11 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 +; SI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v23 +; SI-NEXT: v_alignbit_b32 v4, v4, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_alignbit_b32 v9, v6, v1, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v20 +; SI-NEXT: v_alignbit_b32 v8, v7, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 +; SI-NEXT: v_alignbit_b32 v6, v5, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v10, v3, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v11, v1, v11, 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_alignbit_b32 v12, v11, v12, 16 +; SI-NEXT: v_alignbit_b32 v13, v10, v13, 16 +; SI-NEXT: v_alignbit_b32 v14, v6, v14, 16 +; SI-NEXT: v_alignbit_b32 v15, v8, v15, 16 ; SI-NEXT: .LBB102_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v0, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v8 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v14 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v14 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v15 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v0, v0, v12 +; SI-NEXT: v_or_b32_e32 v2, v2, v11 +; SI-NEXT: v_or_b32_e32 v4, v4, v10 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16bf16_to_v16f16: @@ -36366,173 +35795,145 @@ define inreg <16 x half> @bitcast_v16bf16_to_v16f16_scalar(<16 x bfloat> inreg % ; SI-NEXT: s_and_b32 s19, s16, 0xffff0000 ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s15 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s14 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s13 -; SI-NEXT: v_mul_f32_e64 v15, 1.0, s12 -; SI-NEXT: v_mul_f32_e64 v19, 1.0, s11 -; SI-NEXT: v_mul_f32_e64 v22, 1.0, s10 -; SI-NEXT: v_mul_f32_e64 v25, 1.0, s9 -; SI-NEXT: v_mul_f32_e64 v27, 1.0, s8 -; SI-NEXT: v_mul_f32_e64 v28, 1.0, s7 -; SI-NEXT: v_mul_f32_e64 v29, 1.0, s6 -; SI-NEXT: v_mul_f32_e64 v30, 1.0, s5 -; SI-NEXT: v_mul_f32_e64 v31, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v51, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s14 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s12 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s13 +; SI-NEXT: v_mul_f32_e64 v49, 1.0, s10 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s9 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s6 +; SI-NEXT: v_mul_f32_e64 v39, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s5 +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s15 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s11 +; SI-NEXT: v_mul_f32_e64 v24, 1.0, s7 ; SI-NEXT: s_cbranch_scc0 .LBB103_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v36 +; SI-NEXT: v_lshr_b64 v[3:4], v[19:20], 16 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v39 +; SI-NEXT: v_lshr_b64 v[13:14], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 +; SI-NEXT: v_lshr_b64 v[10:11], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[7:8], v[15:16], 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v50 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v49 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v48 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v48 +; SI-NEXT: v_lshr_b64 v[33:34], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[29:30], 16 +; SI-NEXT: v_lshr_b64 v[11:12], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[31:32], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[27:28], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[24:25], 16 ; SI-NEXT: s_cbranch_execnz .LBB103_3 ; SI-NEXT: .LBB103_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v30 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v28 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v22 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v12 +; SI-NEXT: v_lshr_b64 v[5:6], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; SI-NEXT: v_lshr_b64 v[21:22], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_lshr_b64 v[22:23], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_lshr_b64 v[25:26], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v36 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; SI-NEXT: v_lshr_b64 v[3:4], v[19:20], 16 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_lshr_b64 v[7:8], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[10:11], v[17:18], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v13 +; SI-NEXT: v_lshr_b64 v[13:14], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_lshr_b64 v[33:34], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[11:12], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[6:7], 16 ; SI-NEXT: .LBB103_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v20 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v16 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v24 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v21 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v33 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v20 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v8 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v18 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v34 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB103_4: ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_branch .LBB103_2 ; ; VI-LABEL: bitcast_v16bf16_to_v16f16_scalar: @@ -37266,46 +36667,22 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v16f16_to_v32i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 +; SI-NEXT: v_mov_b32_e32 v37, v7 +; SI-NEXT: v_mov_b32_e32 v39, v6 +; SI-NEXT: v_mov_b32_e32 v36, v5 +; SI-NEXT: v_mov_b32_e32 v38, v4 +; SI-NEXT: v_mov_b32_e32 v34, v3 +; SI-NEXT: v_mov_b32_e32 v35, v2 +; SI-NEXT: v_mov_b32_e32 v32, v1 +; SI-NEXT: v_mov_b32_e32 v33, v0 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v33 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -37345,22 +36722,30 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB104_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 -; SI-NEXT: v_or_b32_e32 v8, v36, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 -; SI-NEXT: v_or_b32_e32 v12, v35, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 -; SI-NEXT: v_or_b32_e32 v16, v39, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 -; SI-NEXT: v_or_b32_e32 v20, v38, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_or_b32_e32 v24, v50, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v30 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 -; SI-NEXT: v_or_b32_e32 v4, v32, v1 -; SI-NEXT: v_or_b32_e32 v28, v49, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 +; SI-NEXT: v_or_b32_e32 v8, v5, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 +; SI-NEXT: v_or_b32_e32 v12, v5, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v50 +; SI-NEXT: v_or_b32_e32 v16, v5, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 +; SI-NEXT: v_or_b32_e32 v20, v5, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v51 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_or_b32_e32 v24, v5, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v30 +; SI-NEXT: v_or_b32_e32 v4, v1, v2 +; SI-NEXT: v_or_b32_e32 v28, v5, v7 ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 ; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 @@ -37381,25 +36766,25 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; SI-NEXT: v_bfe_u32 v15, v14, 8, 8 ; SI-NEXT: v_bfe_u32 v23, v22, 8, 8 ; SI-NEXT: v_bfe_u32 v31, v30, 8, 8 -; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB104_2 ; SI-NEXT: .LBB104_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -37407,9 +36792,9 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 ; SI-NEXT: v_or_b32_e32 v24, v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -37421,28 +36806,28 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v16, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v0 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v35 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 ; SI-NEXT: v_or_b32_e32 v20, v0, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_or_b32_e32 v8, v2, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 ; SI-NEXT: v_or_b32_e32 v12, v1, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 @@ -37937,219 +37322,221 @@ define inreg <32 x i8> @bitcast_v16f16_to_v32i8_scalar(<16 x half> inreg %a, i32 ; SI-LABEL: bitcast_v16f16_to_v32i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s17 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v11 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v42, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v9 +; SI-NEXT: s_lshr_b32 s78, s23, 16 +; SI-NEXT: s_lshr_b32 s79, s22, 16 +; SI-NEXT: s_lshr_b32 s76, s21, 16 +; SI-NEXT: s_lshr_b32 s77, s20, 16 +; SI-NEXT: s_lshr_b32 s74, s19, 16 +; SI-NEXT: s_lshr_b32 s75, s18, 16 +; SI-NEXT: s_lshr_b32 s72, s17, 16 +; SI-NEXT: s_lshr_b32 s73, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: s_cbranch_scc0 .LBB105_4 +; SI-NEXT: s_cbranch_scc0 .LBB105_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; SI-NEXT: v_or_b32_e32 v48, v8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_or_b32_e32 v49, v0, v1 -; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_or_b32_e32 v35, v39, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 -; SI-NEXT: v_or_b32_e32 v36, v32, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_or_b32_e32 v37, v54, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; SI-NEXT: v_or_b32_e32 v38, v53, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 -; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 24 -; SI-NEXT: v_lshr_b64 v[11:12], v[35:36], 24 -; SI-NEXT: v_or_b32_e32 v33, v41, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_lshr_b64 v[4:5], v[48:49], 16 -; SI-NEXT: v_lshr_b64 v[12:13], v[35:36], 16 -; SI-NEXT: v_or_b32_e32 v34, v40, v2 -; SI-NEXT: v_lshr_b64 v[24:25], v[37:38], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[35:36], 8 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v49 -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v36 -; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v38 -; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v34 -; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 -; SI-NEXT: v_bfe_u32 v15, v14, 8, 8 -; SI-NEXT: v_bfe_u32 v23, v22, 8, 8 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s73, 16 +; SI-NEXT: s_or_b32 s10, s4, s5 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s72, 16 +; SI-NEXT: s_or_b32 s11, s4, s5 +; SI-NEXT: s_lshr_b64 s[4:5], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[6:7], s[10:11], 16 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s75, 16 +; SI-NEXT: s_or_b32 s26, s5, s7 +; SI-NEXT: s_and_b32 s5, s19, 0xffff +; SI-NEXT: s_lshl_b32 s7, s74, 16 +; SI-NEXT: s_or_b32 s27, s5, s7 +; SI-NEXT: s_and_b32 s5, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s77, 16 +; SI-NEXT: s_or_b32 s44, s5, s7 +; SI-NEXT: s_and_b32 s5, s21, 0xffff +; SI-NEXT: s_lshl_b32 s7, s76, 16 +; SI-NEXT: s_or_b32 s45, s5, s7 +; SI-NEXT: s_and_b32 s5, s22, 0xffff +; SI-NEXT: s_lshl_b32 s7, s79, 16 +; SI-NEXT: s_or_b32 s60, s5, s7 +; SI-NEXT: s_and_b32 s5, s23, 0xffff +; SI-NEXT: s_lshl_b32 s7, s78, 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[12:13], s[26:27], 24 +; SI-NEXT: s_lshr_b64 s[14:15], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[26:27], 8 +; SI-NEXT: s_lshr_b64 s[28:29], s[44:45], 24 +; SI-NEXT: s_lshr_b64 s[40:41], s[44:45], 16 +; SI-NEXT: s_or_b32 s61, s5, s7 +; SI-NEXT: s_lshr_b64 s[42:43], s[44:45], 8 +; SI-NEXT: s_lshr_b64 s[46:47], s[60:61], 24 +; SI-NEXT: s_lshr_b64 s[56:57], s[60:61], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[60:61], 8 +; SI-NEXT: s_lshr_b32 s5, s11, 8 +; SI-NEXT: s_lshr_b32 s7, s27, 8 +; SI-NEXT: s_lshr_b32 s13, s45, 8 +; SI-NEXT: s_lshr_b32 s15, s61, 8 +; SI-NEXT: s_bfe_u32 s9, s72, 0x80008 +; SI-NEXT: s_bfe_u32 s25, s74, 0x80008 +; SI-NEXT: s_bfe_u32 s29, s76, 0x80008 +; SI-NEXT: s_bfe_u32 s41, s78, 0x80008 +; SI-NEXT: s_cbranch_execnz .LBB105_4 +; SI-NEXT: .LBB105_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, s79 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s22 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v33, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s78 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s23 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 ; SI-NEXT: v_bfe_u32 v31, v30, 8, 8 -; SI-NEXT: v_lshr_b64 v[19:20], v[37:38], 24 -; SI-NEXT: v_lshr_b64 v[17:18], v[37:38], 8 +; SI-NEXT: v_or_b32_e32 v34, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s77 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s20 +; SI-NEXT: v_lshr_b64 v[24:25], v[33:34], 16 ; SI-NEXT: v_lshr_b64 v[27:28], v[33:34], 24 -; SI-NEXT: v_lshr_b64 v[50:51], v[33:34], 16 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshr_b64 v[25:26], v[33:34], 8 -; SI-NEXT: s_cbranch_execnz .LBB105_3 -; SI-NEXT: .LBB105_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v34 +; SI-NEXT: v_or_b32_e32 v35, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s76 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s21 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 -; SI-NEXT: v_or_b32_e32 v33, v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 -; SI-NEXT: v_or_b32_e32 v34, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v37, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; SI-NEXT: v_bfe_u32 v23, v22, 8, 8 +; SI-NEXT: v_or_b32_e32 v36, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s75 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 +; SI-NEXT: v_lshr_b64 v[19:20], v[35:36], 24 +; SI-NEXT: v_lshr_b64 v[20:21], v[35:36], 16 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 -; SI-NEXT: v_or_b32_e32 v38, v1, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v2 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v35, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 -; SI-NEXT: v_or_b32_e32 v36, v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshr_b64 v[17:18], v[35:36], 8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v36 +; SI-NEXT: v_or_b32_e32 v37, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s74 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s19 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 +; SI-NEXT: v_bfe_u32 v15, v14, 8, 8 +; SI-NEXT: v_or_b32_e32 v38, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s73 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_lshr_b64 v[11:12], v[37:38], 24 +; SI-NEXT: v_lshr_b64 v[12:13], v[37:38], 16 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v48, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_or_b32_e32 v49, v0, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshr_b64 v[9:10], v[37:38], 8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v38 +; SI-NEXT: v_or_b32_e32 v48, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; SI-NEXT: v_or_b32_e32 v49, v1, v0 +; SI-NEXT: v_lshr_b64 v[0:1], v[48:49], 16 ; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 24 -; SI-NEXT: v_lshr_b64 v[11:12], v[35:36], 24 -; SI-NEXT: v_lshr_b64 v[24:25], v[37:38], 16 -; SI-NEXT: v_lshr_b64 v[4:5], v[48:49], 16 ; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 8 -; SI-NEXT: v_lshr_b64 v[12:13], v[35:36], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[35:36], 8 -; SI-NEXT: v_lshr_b64 v[19:20], v[37:38], 24 -; SI-NEXT: v_lshr_b64 v[17:18], v[37:38], 8 -; SI-NEXT: v_lshr_b64 v[27:28], v[33:34], 24 -; SI-NEXT: v_lshr_b64 v[50:51], v[33:34], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[33:34], 8 ; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v49 -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v36 -; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v38 -; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v34 -; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 -; SI-NEXT: v_bfe_u32 v15, v14, 8, 8 -; SI-NEXT: v_bfe_u32 v23, v22, 8, 8 -; SI-NEXT: v_bfe_u32 v31, v30, 8, 8 -; SI-NEXT: .LBB105_3: ; %end -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: s_branch .LBB105_5 +; SI-NEXT: .LBB105_3: +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr5 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: s_branch .LBB105_2 +; SI-NEXT: .LBB105_4: +; SI-NEXT: v_mov_b32_e32 v30, s78 +; SI-NEXT: v_mov_b32_e32 v22, s76 +; SI-NEXT: v_mov_b32_e32 v14, s74 +; SI-NEXT: v_mov_b32_e32 v6, s72 +; SI-NEXT: v_mov_b32_e32 v31, s41 +; SI-NEXT: v_mov_b32_e32 v23, s29 +; SI-NEXT: v_mov_b32_e32 v15, s25 +; SI-NEXT: v_mov_b32_e32 v7, s9 +; SI-NEXT: v_mov_b32_e32 v49, s11 +; SI-NEXT: v_mov_b32_e32 v48, s10 +; SI-NEXT: v_mov_b32_e32 v38, s27 +; SI-NEXT: v_mov_b32_e32 v37, s26 +; SI-NEXT: v_mov_b32_e32 v36, s45 +; SI-NEXT: v_mov_b32_e32 v35, s44 +; SI-NEXT: v_mov_b32_e32 v34, s61 +; SI-NEXT: v_mov_b32_e32 v33, s60 +; SI-NEXT: v_mov_b32_e32 v5, s5 +; SI-NEXT: v_mov_b32_e32 v13, s7 +; SI-NEXT: v_mov_b32_e32 v21, s13 +; SI-NEXT: v_mov_b32_e32 v29, s15 +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: v_mov_b32_e32 v11, s12 +; SI-NEXT: v_mov_b32_e32 v12, s14 +; SI-NEXT: v_mov_b32_e32 v9, s24 +; SI-NEXT: v_mov_b32_e32 v19, s28 +; SI-NEXT: v_mov_b32_e32 v20, s40 +; SI-NEXT: v_mov_b32_e32 v17, s42 +; SI-NEXT: v_mov_b32_e32 v27, s46 +; SI-NEXT: v_mov_b32_e32 v24, s56 +; SI-NEXT: v_mov_b32_e32 v25, s58 +; SI-NEXT: .LBB105_5: ; %end +; SI-NEXT: v_mov_b32_e32 v2, v0 ; SI-NEXT: v_mov_b32_e32 v0, v48 -; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v4, v49 -; SI-NEXT: v_mov_b32_e32 v8, v35 ; SI-NEXT: v_mov_b32_e32 v10, v12 -; SI-NEXT: v_mov_b32_e32 v12, v36 -; SI-NEXT: v_mov_b32_e32 v16, v37 -; SI-NEXT: v_mov_b32_e32 v18, v24 -; SI-NEXT: v_mov_b32_e32 v20, v38 +; SI-NEXT: v_mov_b32_e32 v8, v37 +; SI-NEXT: v_mov_b32_e32 v12, v38 +; SI-NEXT: v_mov_b32_e32 v18, v20 +; SI-NEXT: v_mov_b32_e32 v16, v35 +; SI-NEXT: v_mov_b32_e32 v20, v36 +; SI-NEXT: v_mov_b32_e32 v26, v24 ; SI-NEXT: v_mov_b32_e32 v24, v33 -; SI-NEXT: v_mov_b32_e32 v26, v50 ; SI-NEXT: v_mov_b32_e32 v28, v34 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB105_4: -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: s_branch .LBB105_2 ; ; VI-LABEL: bitcast_v16f16_to_v32i8_scalar: ; VI: ; %bb.0: @@ -38588,95 +37975,111 @@ define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v32i8_to_v16f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v33, 8, v3 -; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v11 -; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v13 -; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v15 -; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v17 -; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v19 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v31, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v34, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v36, 24, v11 ; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v21 -; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v23 -; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v25 -; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v27 -; SI-NEXT: v_lshlrev_b32_e32 v54, 8, v29 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v29 +; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v55, 8, v25 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v38 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v55, 8, v55 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB106_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v0, v0, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v0, v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v19 +; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v34, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v23, v0, v2 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v8 -; SI-NEXT: v_or_b32_e32 v0, v0, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 -; SI-NEXT: v_or_b32_e32 v0, v0, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v0, v0, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 -; SI-NEXT: v_or_b32_e32 v0, v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v27 +; SI-NEXT: v_or_b32_e32 v9, v36, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v5, v5, v50 +; SI-NEXT: v_or_b32_e32 v10, v37, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v15, v0, v9 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v16 -; SI-NEXT: v_or_b32_e32 v0, v0, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v0, v0, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v20 -; SI-NEXT: v_or_b32_e32 v0, v0, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 -; SI-NEXT: v_or_b32_e32 v0, v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v31, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v5, v5, v10 +; SI-NEXT: v_or_b32_e32 v11, v51, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v32 +; SI-NEXT: v_or_b32_e32 v3, v3, v35 +; SI-NEXT: v_alignbit_b32 v21, v5, v11, 16 +; SI-NEXT: v_or_b32_e32 v7, v7, v53 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v26 +; SI-NEXT: v_or_b32_e32 v11, v0, v11 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v24 -; SI-NEXT: v_or_b32_e32 v0, v0, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v26 -; SI-NEXT: v_or_b32_e32 v0, v0, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v28 -; SI-NEXT: v_or_b32_e32 v0, v0, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v30 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v12, v52, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v0, v0, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: v_or_b32_e32 v7, v7, v12 +; SI-NEXT: v_or_b32_e32 v14, v54, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_alignbit_b32 v13, v1, v2, 16 +; SI-NEXT: v_alignbit_b32 v17, v3, v9, 16 +; SI-NEXT: v_alignbit_b32 v25, v7, v14, 16 +; SI-NEXT: v_or_b32_e32 v9, v0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v12 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 @@ -38694,140 +38097,149 @@ define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: .LBB106_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB106_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v26 ; SI-NEXT: v_or_b32_e32 v1, v55, v1 -; SI-NEXT: v_add_i32_e32 v30, vcc, 0x300, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v54, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v30 ; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_or_b32_e32 v1, v54, v1 -; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_or_b32_e32 v1, v53, v1 -; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v52, v1 -; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v52, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v51, v1 -; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v18 +; SI-NEXT: v_or_b32_e32 v1, v49, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v51, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v22 ; SI-NEXT: v_or_b32_e32 v1, v50, v1 -; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v49, v1 -; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v39, v1 -; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v38, v1 -; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v37, v1 -; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v36, v1 -; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v10 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v36, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v14 ; SI-NEXT: v_or_b32_e32 v1, v35, v1 -; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v34, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v34, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v33, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v23, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 ; SI-NEXT: v_or_b32_e32 v0, v32, v0 -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v30 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v31, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v0 +; SI-NEXT: v_alignbit_b32 v13, v1, v23, 16 +; SI-NEXT: v_alignbit_b32 v17, v3, v15, 16 +; SI-NEXT: v_alignbit_b32 v21, v5, v11, 16 +; SI-NEXT: v_alignbit_b32 v25, v7, v9, 16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 ; SI-NEXT: .LBB106_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v21 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v19 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v29 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v27 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v17 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v38 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v21 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v39 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v25 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v48 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i8_to_v16f16: @@ -39537,240 +38949,272 @@ define inreg <16 x half> @bitcast_v32i8_to_v16f16_scalar(<32 x i8> inreg %a, i32 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: v_readfirstlane_b32 s46, v17 -; SI-NEXT: v_readfirstlane_b32 s47, v16 -; SI-NEXT: v_readfirstlane_b32 s44, v15 -; SI-NEXT: v_readfirstlane_b32 s45, v14 -; SI-NEXT: v_readfirstlane_b32 s42, v13 -; SI-NEXT: v_readfirstlane_b32 s43, v12 -; SI-NEXT: v_readfirstlane_b32 s40, v11 -; SI-NEXT: v_readfirstlane_b32 s41, v10 -; SI-NEXT: v_readfirstlane_b32 s14, v9 -; SI-NEXT: v_readfirstlane_b32 s15, v8 -; SI-NEXT: v_readfirstlane_b32 s12, v7 -; SI-NEXT: v_readfirstlane_b32 s13, v6 -; SI-NEXT: v_readfirstlane_b32 s10, v5 -; SI-NEXT: v_readfirstlane_b32 s11, v4 -; SI-NEXT: v_readfirstlane_b32 s7, v3 -; SI-NEXT: v_readfirstlane_b32 s9, v2 -; SI-NEXT: v_readfirstlane_b32 s6, v1 +; SI-NEXT: v_readfirstlane_b32 s63, v17 +; SI-NEXT: v_readfirstlane_b32 s62, v16 +; SI-NEXT: v_readfirstlane_b32 s74, v15 +; SI-NEXT: v_readfirstlane_b32 s75, v14 +; SI-NEXT: v_readfirstlane_b32 s77, v13 +; SI-NEXT: v_readfirstlane_b32 s76, v12 +; SI-NEXT: v_readfirstlane_b32 s78, v11 +; SI-NEXT: v_readfirstlane_b32 s79, v10 +; SI-NEXT: v_readfirstlane_b32 s57, v9 +; SI-NEXT: v_readfirstlane_b32 s56, v8 +; SI-NEXT: v_readfirstlane_b32 s58, v7 +; SI-NEXT: v_readfirstlane_b32 s59, v6 +; SI-NEXT: v_readfirstlane_b32 s61, v5 +; SI-NEXT: v_readfirstlane_b32 s60, v4 +; SI-NEXT: v_readfirstlane_b32 s72, v3 +; SI-NEXT: v_readfirstlane_b32 s73, v2 +; SI-NEXT: v_readfirstlane_b32 s46, v1 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s8, v0 +; SI-NEXT: v_readfirstlane_b32 s47, v0 ; SI-NEXT: s_cbranch_scc0 .LBB107_4 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s8, s20, 0xff +; SI-NEXT: s_lshl_b32 s9, s21, 8 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s22, 0xff ; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s11, s23, 24 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_and_b32 s4, s18, 0xff -; SI-NEXT: s_lshl_b32 s5, s19, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: s_lshl_b32 s5, s21, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_and_b32 s4, s22, 0xff -; SI-NEXT: s_lshl_b32 s5, s23, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s44, s11, s9 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_or_b32 s13, s8, s44 +; SI-NEXT: s_and_b32 s8, s28, 0xff +; SI-NEXT: s_lshl_b32 s9, s29, 8 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s12, s6, s5 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s47, 0xff +; SI-NEXT: s_or_b32 s10, s4, s12 ; SI-NEXT: s_and_b32 s4, s24, 0xff ; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s11, s46, 24 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_and_b32 s4, s26, 0xff -; SI-NEXT: s_lshl_b32 s5, s27, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_and_b32 s4, s28, 0xff -; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_and_b32 s4, s8, 0xff -; SI-NEXT: s_lshl_b32 s5, s6, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_and_b32 s4, s9, 0xff -; SI-NEXT: s_lshl_b32 s5, s7, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_and_b32 s4, s11, 0xff -; SI-NEXT: s_lshl_b32 s5, s10, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_and_b32 s4, s13, 0xff -; SI-NEXT: s_lshl_b32 s5, s12, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_and_b32 s4, s15, 0xff -; SI-NEXT: s_lshl_b32 s5, s14, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_and_b32 s4, s41, 0xff -; SI-NEXT: s_lshl_b32 s5, s40, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_and_b32 s4, s43, 0xff -; SI-NEXT: s_lshl_b32 s5, s42, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_and_b32 s4, s45, 0xff -; SI-NEXT: s_lshl_b32 s5, s44, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_and_b32 s4, s47, 0xff -; SI-NEXT: s_lshl_b32 s5, s46, 8 +; SI-NEXT: s_and_b32 s5, s26, 0xff +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s45, s11, s9 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s27, 24 +; SI-NEXT: s_or_b32 s15, s8, s45 +; SI-NEXT: s_and_b32 s8, s59, 0xff +; SI-NEXT: s_lshl_b32 s9, s58, 8 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s14, s6, s5 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s56, 0xff +; SI-NEXT: s_or_b32 s6, s4, s14 +; SI-NEXT: s_and_b32 s4, s73, 0xff +; SI-NEXT: s_lshl_b32 s5, s72, 8 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s11, s57, 24 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_and_b32 s5, s60, 0xff +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s88, s11, s9 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s61, 24 +; SI-NEXT: s_or_b32 s41, s8, s88 +; SI-NEXT: s_and_b32 s8, s75, 0xff +; SI-NEXT: s_lshl_b32 s9, s74, 8 +; SI-NEXT: s_or_b32 s40, s7, s5 +; SI-NEXT: s_and_b32 s5, s79, 0xff +; SI-NEXT: s_lshl_b32 s7, s78, 8 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s62, 0xff +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s76, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s11, s63, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_or_b32 s89, s11, s9 +; SI-NEXT: s_lshl_b32 s9, s77, 24 +; SI-NEXT: s_or_b32 s42, s9, s7 +; SI-NEXT: s_and_b32 s7, s8, 0xffff +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s43, s7, s89 +; SI-NEXT: s_or_b32 s4, s4, s40 +; SI-NEXT: s_or_b32 s8, s5, s42 +; SI-NEXT: s_mov_b32 s11, s13 +; SI-NEXT: s_lshr_b64 s[12:13], s[12:13], 16 +; SI-NEXT: s_mov_b32 s7, s15 +; SI-NEXT: s_lshr_b64 s[14:15], s[14:15], 16 +; SI-NEXT: s_mov_b32 s5, s41 +; SI-NEXT: s_lshr_b64 s[40:41], s[40:41], 16 +; SI-NEXT: s_mov_b32 s9, s43 +; SI-NEXT: s_lshr_b64 s[42:43], s[42:43], 16 +; SI-NEXT: s_lshr_b32 s15, s44, 16 +; SI-NEXT: s_lshr_b32 s41, s45, 16 +; SI-NEXT: s_lshr_b32 s43, s88, 16 +; SI-NEXT: s_lshr_b32 s13, s89, 16 ; SI-NEXT: s_cbranch_execnz .LBB107_3 ; SI-NEXT: .LBB107_2: ; %cmp.true -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_add_i32 s8, s8, 3 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_and_b32 s9, s9, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: s_lshl_b32 s6, s6, 8 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: s_and_b32 s11, s11, 0xff -; SI-NEXT: s_lshl_b32 s10, s10, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: s_and_b32 s8, s28, 0xff -; SI-NEXT: s_lshl_b32 s9, s29, 8 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: s_and_b32 s13, s13, 0xff -; SI-NEXT: s_lshl_b32 s12, s12, 8 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xff -; SI-NEXT: s_lshl_b32 s11, s27, 8 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s47, s47, 3 -; SI-NEXT: s_and_b32 s15, s15, 0xff -; SI-NEXT: s_lshl_b32 s14, s14, 8 -; SI-NEXT: s_or_b32 s12, s12, s13 -; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: s_and_b32 s11, s24, 0xff -; SI-NEXT: s_lshl_b32 s13, s25, 8 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_and_b32 s4, s47, 0xff -; SI-NEXT: s_lshl_b32 s5, s46, 8 -; SI-NEXT: s_add_i32 s45, s45, 3 -; SI-NEXT: s_add_i32 s43, s43, 3 -; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: s_or_b32 s11, s13, s11 -; SI-NEXT: s_and_b32 s13, s22, 0xff -; SI-NEXT: s_lshl_b32 s15, s23, 8 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s79, s79, 3 +; SI-NEXT: s_and_b32 s4, s79, 0xff +; SI-NEXT: s_lshl_b32 s5, s78, 8 +; SI-NEXT: s_add_i32 s76, s76, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s45, 0xff -; SI-NEXT: s_lshl_b32 s44, s44, 8 -; SI-NEXT: s_and_b32 s43, s43, 0xff -; SI-NEXT: s_lshl_b32 s42, s42, 8 -; SI-NEXT: s_and_b32 s41, s41, 0xff -; SI-NEXT: s_lshl_b32 s40, s40, 8 -; SI-NEXT: s_or_b32 s13, s15, s13 -; SI-NEXT: s_and_b32 s15, s20, 0xff -; SI-NEXT: s_lshl_b32 s20, s21, 8 -; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 8 -; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: s_or_b32 s5, s44, s5 -; SI-NEXT: s_or_b32 s42, s42, s43 -; SI-NEXT: s_or_b32 s40, s40, s41 -; SI-NEXT: s_or_b32 s15, s20, s15 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s6, s76, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s77, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s75, s75, 3 +; SI-NEXT: s_add_i32 s8, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s75, 0xff +; SI-NEXT: s_lshl_b32 s5, s74, 8 +; SI-NEXT: s_add_i32 s62, s62, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s62, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s63, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s73, s73, 3 +; SI-NEXT: s_add_i32 s9, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s73, 0xff +; SI-NEXT: s_lshl_b32 s5, s72, 8 +; SI-NEXT: s_add_i32 s60, s60, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s60, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s61, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s59, s59, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s59, 0xff +; SI-NEXT: s_lshl_b32 s6, s58, 8 +; SI-NEXT: s_add_i32 s56, s56, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s56, 0xff ; SI-NEXT: s_addk_i32 s5, 0x300 -; SI-NEXT: s_addk_i32 s42, 0x300 -; SI-NEXT: s_addk_i32 s40, 0x300 -; SI-NEXT: s_addk_i32 s14, 0x300 -; SI-NEXT: s_addk_i32 s12, 0x300 -; SI-NEXT: s_addk_i32 s10, 0x300 -; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: s_lshl_b32 s6, s57, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s10, s26, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 -; SI-NEXT: s_addk_i32 s8, 0x300 -; SI-NEXT: s_addk_i32 s9, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s10 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s10, s29, 8 +; SI-NEXT: s_add_i32 s47, s47, 3 +; SI-NEXT: s_or_b32 s7, s10, s7 +; SI-NEXT: s_and_b32 s11, s47, 0xff +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: s_lshl_b32 s10, s46, 24 +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_or_b32 s7, s10, s7 +; SI-NEXT: s_and_b32 s10, s16, 0xff +; SI-NEXT: s_lshl_b32 s11, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s12, s18, 0xff +; SI-NEXT: s_addk_i32 s10, 0x300 +; SI-NEXT: s_lshl_b32 s11, s19, 24 +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s20, 0xff +; SI-NEXT: s_lshl_b32 s12, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s13, s22, 0xff ; SI-NEXT: s_addk_i32 s11, 0x300 -; SI-NEXT: s_addk_i32 s13, 0x300 -; SI-NEXT: s_addk_i32 s15, 0x300 -; SI-NEXT: s_addk_i32 s18, 0x300 -; SI-NEXT: s_addk_i32 s16, 0x300 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshl_b32 s12, s23, 24 +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_add_i32 s7, s7, 0x3000000 +; SI-NEXT: s_add_i32 s10, s10, 0x3000000 +; SI-NEXT: s_add_i32 s11, s11, 0x3000000 +; SI-NEXT: s_lshr_b64 s[12:13], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[8:9], 16 +; SI-NEXT: s_lshr_b32 s15, s11, 16 +; SI-NEXT: s_lshr_b32 s41, s7, 16 +; SI-NEXT: s_lshr_b32 s43, s5, 16 +; SI-NEXT: s_lshr_b32 s13, s9, 16 ; SI-NEXT: .LBB107_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v9 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v14 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v13 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s12, s15, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s12, s14, 16 +; SI-NEXT: s_or_b32 s6, s6, s12 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s12, s41, 16 +; SI-NEXT: s_or_b32 s7, s7, s12 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s12, s40, 16 +; SI-NEXT: s_or_b32 s4, s4, s12 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s12, s43, 16 +; SI-NEXT: s_or_b32 s5, s5, s12 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s12, s42, 16 +; SI-NEXT: s_or_b32 s8, s8, s12 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s12, s13, 16 +; SI-NEXT: s_or_b32 s9, s9, s12 +; SI-NEXT: v_mov_b32_e32 v0, s10 +; SI-NEXT: v_mov_b32_e32 v1, s11 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_mov_b32_e32 v5, s5 +; SI-NEXT: v_mov_b32_e32 v6, s8 +; SI-NEXT: v_mov_b32_e32 v7, s9 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB107_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: s_branch .LBB107_2 ; ; VI-LABEL: bitcast_v32i8_to_v16f16_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll index 6c8abf8733579..6656733d53e51 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll @@ -1330,142 +1330,78 @@ define <18 x half> @bitcast_v9i32_to_v18f16(<9 x i32> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB8_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: .LBB8_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v9, v0, v8, 16 +; SI-NEXT: v_alignbit_b32 v10, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v11, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v12, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v14, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB8_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v0 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_alignbit_b32 v10, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v11, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v12, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v14, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, v0, v8, 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 ; SI-NEXT: .LBB8_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v21 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v20 -; SI-NEXT: v_or_b32_e32 v2, v5, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v17 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v16 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v13 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v17 +; SI-NEXT: v_or_b32_e32 v2, v2, v12 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v16 +; SI-NEXT: v_or_b32_e32 v4, v4, v11 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_or_b32_e32 v6, v6, v10 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v1, v1, v14 +; SI-NEXT: v_or_b32_e32 v3, v3, v12 +; SI-NEXT: v_or_b32_e32 v5, v5, v11 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9i32_to_v18f16: @@ -1556,128 +1492,83 @@ define inreg <18 x half> @bitcast_v9i32_to_v18f16_scalar(<9 x i32> inreg %a, i32 ; SI-NEXT: s_cmp_lg_u32 s25, 0 ; SI-NEXT: s_cbranch_scc0 .LBB9_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s16 +; SI-NEXT: s_lshr_b32 s25, s23, 16 +; SI-NEXT: s_lshr_b32 s26, s21, 16 +; SI-NEXT: s_lshr_b32 s27, s19, 16 +; SI-NEXT: s_lshr_b32 s28, s17, 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB9_3 ; SI-NEXT: .LBB9_2: ; %cmp.true -; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_lshr_b32 s25, s23, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s26, s21, 16 +; SI-NEXT: s_lshr_b32 s27, s19, 16 +; SI-NEXT: s_lshr_b32 s28, s17, 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[24:25], 16 ; SI-NEXT: .LBB9_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v15, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v15, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_or_b32_e32 v5, v11, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 -; SI-NEXT: v_or_b32_e32 v1, v18, v1 -; SI-NEXT: v_or_b32_e32 v4, v15, v4 -; SI-NEXT: v_or_b32_e32 v6, v13, v6 -; SI-NEXT: v_or_b32_e32 v7, v10, v7 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s12, 16 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s9, s28, 16 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s18, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s19, 0xffff +; SI-NEXT: s_lshl_b32 s11, s27, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s20, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s11, s6 +; SI-NEXT: s_and_b32 s11, s21, 0xffff +; SI-NEXT: s_lshl_b32 s12, s26, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s12, s22, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s12, s4 +; SI-NEXT: s_and_b32 s12, s23, 0xffff +; SI-NEXT: s_lshl_b32 s13, s25, 16 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s24, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_or_b32 s8, s13, s8 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s10 +; SI-NEXT: v_mov_b32_e32 v4, s6 +; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s8 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB9_4: -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: s_branch .LBB9_2 ; ; VI-LABEL: bitcast_v9i32_to_v18f16_scalar: @@ -1792,51 +1683,24 @@ define <9 x i32> @bitcast_v18f16_to_v9i32(<18 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v18f16_to_v9i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v8 +; SI-NEXT: v_mov_b32_e32 v18, v8 +; SI-NEXT: v_mov_b32_e32 v10, v7 +; SI-NEXT: v_mov_b32_e32 v11, v6 +; SI-NEXT: v_mov_b32_e32 v12, v5 +; SI-NEXT: v_mov_b32_e32 v13, v4 +; SI-NEXT: v_mov_b32_e32 v14, v3 +; SI-NEXT: v_mov_b32_e32 v15, v2 +; SI-NEXT: v_mov_b32_e32 v16, v1 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -1849,24 +1713,42 @@ define <9 x i32> @bitcast_v18f16_to_v9i32(<18 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB10_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; SI-NEXT: v_or_b32_e32 v0, v26, v0 -; SI-NEXT: v_or_b32_e32 v1, v24, v1 -; SI-NEXT: v_or_b32_e32 v2, v22, v2 -; SI-NEXT: v_or_b32_e32 v3, v20, v3 -; SI-NEXT: v_or_b32_e32 v4, v18, v4 -; SI-NEXT: v_or_b32_e32 v5, v16, v5 -; SI-NEXT: v_or_b32_e32 v6, v14, v6 -; SI-NEXT: v_or_b32_e32 v7, v12, v7 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v25 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v24 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v23 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v20 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v19 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 @@ -1876,22 +1758,13 @@ define <9 x i32> @bitcast_v18f16_to_v9i32(<18 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB10_2 ; SI-NEXT: .LBB10_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -1904,10 +1777,10 @@ define <9 x i32> @bitcast_v18f16_to_v9i32(<18 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v14 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -1916,25 +1789,25 @@ define <9 x i32> @bitcast_v18f16_to_v9i32(<18 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v13 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v11 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -1942,12 +1815,12 @@ define <9 x i32> @bitcast_v18f16_to_v9i32(<18 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v20 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v18 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -2070,121 +1943,94 @@ define inreg <9 x i32> @bitcast_v18f16_to_v9i32_scalar(<18 x half> inreg %a, i32 ; SI-LABEL: bitcast_v18f16_to_v9i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_lshr_b32 s13, s24, 16 +; SI-NEXT: s_lshr_b32 s26, s23, 16 +; SI-NEXT: s_lshr_b32 s27, s22, 16 +; SI-NEXT: s_lshr_b32 s28, s21, 16 +; SI-NEXT: s_lshr_b32 s29, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s25, 0 -; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: s_cbranch_scc0 .LBB11_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; SI-NEXT: v_or_b32_e32 v0, v25, v0 -; SI-NEXT: v_or_b32_e32 v1, v23, v1 -; SI-NEXT: v_or_b32_e32 v2, v21, v2 -; SI-NEXT: v_or_b32_e32 v3, v19, v3 -; SI-NEXT: v_or_b32_e32 v4, v17, v4 -; SI-NEXT: v_or_b32_e32 v5, v15, v5 -; SI-NEXT: v_or_b32_e32 v6, v13, v6 -; SI-NEXT: v_or_b32_e32 v7, v11, v7 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s41, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s40, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s29, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s28, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s27, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s26, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s12, s24, 0xffff +; SI-NEXT: s_lshl_b32 s14, s13, 16 +; SI-NEXT: s_or_b32 s12, s12, s14 +; SI-NEXT: s_cbranch_execnz .LBB11_4 ; SI-NEXT: .LBB11_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s41 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s40 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s29 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s28 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s22 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -2192,29 +2038,39 @@ define inreg <9 x i32> @bitcast_v18f16_to_v9i32_scalar(<18 x half> inreg %a, i32 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s26 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB11_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 +; SI-NEXT: .LBB11_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 ; SI-NEXT: s_branch .LBB11_2 +; SI-NEXT: .LBB11_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v8, s12 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18f16_to_v9i32_scalar: ; VI: ; %bb.0: @@ -3233,142 +3089,78 @@ define <18 x half> @bitcast_v9f32_to_v18f16(<9 x float> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: .LBB16_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v9, v0, v8, 16 +; SI-NEXT: v_alignbit_b32 v10, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v11, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v12, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v14, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v0 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v0 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_alignbit_b32 v10, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v11, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v12, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v14, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, v0, v8, 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 ; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v21 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v20 -; SI-NEXT: v_or_b32_e32 v2, v5, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v17 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v16 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v13 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v17 +; SI-NEXT: v_or_b32_e32 v2, v2, v12 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v16 +; SI-NEXT: v_or_b32_e32 v4, v4, v11 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_or_b32_e32 v6, v6, v10 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v1, v1, v14 +; SI-NEXT: v_or_b32_e32 v3, v3, v12 +; SI-NEXT: v_or_b32_e32 v5, v5, v11 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9f32_to_v18f16: @@ -3452,131 +3244,97 @@ define inreg <18 x half> @bitcast_v9f32_to_v18f16_scalar(<9 x float> inreg %a, i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s25, 0 -; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: s_cbranch_scc0 .LBB17_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s16 -; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: s_lshr_b32 s25, s17, 16 +; SI-NEXT: s_lshr_b32 s28, s23, 16 +; SI-NEXT: s_lshr_b32 s27, s21, 16 +; SI-NEXT: s_lshr_b32 s26, s19, 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[16:17], 16 +; SI-NEXT: s_cbranch_execnz .LBB17_4 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v17, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v16, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v14, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v13, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v12, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 ; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v9, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v7, s24, 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v17 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: .LBB17_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v15, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v15, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_or_b32_e32 v5, v11, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 -; SI-NEXT: v_or_b32_e32 v1, v18, v1 -; SI-NEXT: v_or_b32_e32 v4, v15, v4 -; SI-NEXT: v_or_b32_e32 v6, v13, v6 -; SI-NEXT: v_or_b32_e32 v7, v10, v7 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_lshr_b64 v[9:10], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[12:13], v[4:5], 16 +; SI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_lshr_b64 v[13:14], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[10:11], v[8:9], 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; SI-NEXT: s_branch .LBB17_5 +; SI-NEXT: .LBB17_3: +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: s_branch .LBB17_2 +; SI-NEXT: .LBB17_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v19, s25 +; SI-NEXT: v_mov_b32_e32 v18, s26 +; SI-NEXT: v_mov_b32_e32 v17, s27 +; SI-NEXT: v_mov_b32_e32 v16, s28 +; SI-NEXT: v_mov_b32_e32 v10, s8 +; SI-NEXT: v_mov_b32_e32 v14, s12 +; SI-NEXT: v_mov_b32_e32 v13, s10 +; SI-NEXT: v_mov_b32_e32 v12, s6 +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: .LBB17_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v11 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v18 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v11 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v16 +; SI-NEXT: v_or_b32_e32 v4, v4, v11 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v5, v5, v11 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB17_4: -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v9f32_to_v18f16_scalar: ; VI: ; %bb.0: @@ -3713,51 +3471,24 @@ define <9 x float> @bitcast_v18f16_to_v9f32(<18 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v18f16_to_v9f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v8 +; SI-NEXT: v_mov_b32_e32 v18, v8 +; SI-NEXT: v_mov_b32_e32 v10, v7 +; SI-NEXT: v_mov_b32_e32 v11, v6 +; SI-NEXT: v_mov_b32_e32 v12, v5 +; SI-NEXT: v_mov_b32_e32 v13, v4 +; SI-NEXT: v_mov_b32_e32 v14, v3 +; SI-NEXT: v_mov_b32_e32 v15, v2 +; SI-NEXT: v_mov_b32_e32 v16, v1 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -3770,24 +3501,42 @@ define <9 x float> @bitcast_v18f16_to_v9f32(<18 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB18_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; SI-NEXT: v_or_b32_e32 v0, v26, v0 -; SI-NEXT: v_or_b32_e32 v1, v24, v1 -; SI-NEXT: v_or_b32_e32 v2, v22, v2 -; SI-NEXT: v_or_b32_e32 v3, v20, v3 -; SI-NEXT: v_or_b32_e32 v4, v18, v4 -; SI-NEXT: v_or_b32_e32 v5, v16, v5 -; SI-NEXT: v_or_b32_e32 v6, v14, v6 -; SI-NEXT: v_or_b32_e32 v7, v12, v7 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v25 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v24 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v23 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v20 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v19 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 @@ -3797,22 +3546,13 @@ define <9 x float> @bitcast_v18f16_to_v9f32(<18 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: .LBB18_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -3825,10 +3565,10 @@ define <9 x float> @bitcast_v18f16_to_v9f32(<18 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v14 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -3837,25 +3577,25 @@ define <9 x float> @bitcast_v18f16_to_v9f32(<18 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v13 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v11 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -3863,12 +3603,12 @@ define <9 x float> @bitcast_v18f16_to_v9f32(<18 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v20 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v18 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -3991,121 +3731,94 @@ define inreg <9 x float> @bitcast_v18f16_to_v9f32_scalar(<18 x half> inreg %a, i ; SI-LABEL: bitcast_v18f16_to_v9f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_lshr_b32 s13, s24, 16 +; SI-NEXT: s_lshr_b32 s26, s23, 16 +; SI-NEXT: s_lshr_b32 s27, s22, 16 +; SI-NEXT: s_lshr_b32 s28, s21, 16 +; SI-NEXT: s_lshr_b32 s29, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s25, 0 -; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: s_cbranch_scc0 .LBB19_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; SI-NEXT: v_or_b32_e32 v0, v25, v0 -; SI-NEXT: v_or_b32_e32 v1, v23, v1 -; SI-NEXT: v_or_b32_e32 v2, v21, v2 -; SI-NEXT: v_or_b32_e32 v3, v19, v3 -; SI-NEXT: v_or_b32_e32 v4, v17, v4 -; SI-NEXT: v_or_b32_e32 v5, v15, v5 -; SI-NEXT: v_or_b32_e32 v6, v13, v6 -; SI-NEXT: v_or_b32_e32 v7, v11, v7 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s41, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s40, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s29, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s28, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s27, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s26, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s12, s24, 0xffff +; SI-NEXT: s_lshl_b32 s14, s13, 16 +; SI-NEXT: s_or_b32 s12, s12, s14 +; SI-NEXT: s_cbranch_execnz .LBB19_4 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s41 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s40 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s29 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s28 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s22 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -4113,29 +3826,39 @@ define inreg <9 x float> @bitcast_v18f16_to_v9f32_scalar(<18 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s26 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB19_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 +; SI-NEXT: .LBB19_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 ; SI-NEXT: s_branch .LBB19_2 +; SI-NEXT: .LBB19_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v8, s12 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18f16_to_v9f32_scalar: ; VI: ; %bb.0: @@ -4293,56 +4016,64 @@ define <18 x half> @bitcast_v18i16_to_v18f16(<18 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v18i16_to_v18f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v10 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB20_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v19, v1, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v22, v0, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v17, v1, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v20, v0, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v15, v1, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v18, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v9, v1, v35 +; SI-NEXT: v_or_b32_e32 v14, v0, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_alignbit_b32 v21, v19, v28, 16 +; SI-NEXT: v_alignbit_b32 v23, v17, v30, 16 +; SI-NEXT: v_alignbit_b32 v24, v15, v32, 16 +; SI-NEXT: v_alignbit_b32 v25, v9, v34, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v27 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -4352,93 +4083,94 @@ define <18 x half> @bitcast_v18i16_to_v18f16(<18 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: .LBB20_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB20_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v28, v0 ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v31 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v29 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v6, v34, v6 +; SI-NEXT: v_or_b32_e32 v4, v32, v4 +; SI-NEXT: v_or_b32_e32 v2, v30, v2 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v6, v35, v6 +; SI-NEXT: v_or_b32_e32 v4, v33, v4 +; SI-NEXT: v_or_b32_e32 v2, v31, v2 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v0 +; SI-NEXT: v_alignbit_b32 v21, v19, v22, 16 +; SI-NEXT: v_alignbit_b32 v23, v17, v20, 16 +; SI-NEXT: v_alignbit_b32 v24, v15, v18, 16 +; SI-NEXT: v_alignbit_b32 v25, v9, v14, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 ; SI-NEXT: .LBB20_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v12 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v19 -; SI-NEXT: v_or_b32_e32 v2, v5, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v14 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v22 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v16 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v24 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v18 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v13 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v24 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v25 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18i16_to_v18f16: @@ -4546,131 +4278,157 @@ define inreg <18 x half> @bitcast_v18i16_to_v18f16_scalar(<18 x i16> inreg %a, i ; SI-LABEL: bitcast_v18i16_to_v18f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s14, s24, 16 -; SI-NEXT: s_lshr_b32 s13, s23, 16 -; SI-NEXT: s_lshr_b32 s12, s22, 16 -; SI-NEXT: s_lshr_b32 s11, s21, 16 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_lshr_b32 s42, s24, 16 +; SI-NEXT: s_lshr_b32 s46, s23, 16 +; SI-NEXT: s_lshr_b32 s58, s22, 16 +; SI-NEXT: s_lshr_b32 s45, s21, 16 +; SI-NEXT: s_lshr_b32 s57, s20, 16 +; SI-NEXT: s_lshr_b32 s44, s19, 16 +; SI-NEXT: s_lshr_b32 s56, s18, 16 +; SI-NEXT: s_lshr_b32 s43, s17, 16 +; SI-NEXT: s_lshr_b32 s47, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s25, 0 ; SI-NEXT: s_cbranch_scc0 .LBB21_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s14 +; SI-NEXT: s_and_b32 s5, s24, 0xffff +; SI-NEXT: s_lshl_b32 s7, s42, 16 +; SI-NEXT: s_or_b32 s25, s5, s7 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s43, 16 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s40, s47, 16 +; SI-NEXT: s_or_b32 s41, s5, s7 +; SI-NEXT: s_and_b32 s5, s19, 0xffff +; SI-NEXT: s_lshl_b32 s7, s44, 16 +; SI-NEXT: s_or_b32 s10, s4, s40 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s60, s56, 16 +; SI-NEXT: s_or_b32 s61, s5, s7 +; SI-NEXT: s_and_b32 s5, s21, 0xffff +; SI-NEXT: s_lshl_b32 s7, s45, 16 +; SI-NEXT: s_or_b32 s8, s4, s60 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s62, s57, 16 +; SI-NEXT: s_or_b32 s63, s5, s7 +; SI-NEXT: s_and_b32 s5, s23, 0xffff +; SI-NEXT: s_lshl_b32 s7, s46, 16 +; SI-NEXT: s_or_b32 s6, s4, s62 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s72, s58, 16 +; SI-NEXT: s_or_b32 s73, s5, s7 +; SI-NEXT: s_or_b32 s4, s4, s72 +; SI-NEXT: s_lshr_b64 s[12:13], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[60:61], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[62:63], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[72:73], 16 +; SI-NEXT: s_mov_b32 s11, s41 +; SI-NEXT: s_mov_b32 s9, s61 +; SI-NEXT: s_mov_b32 s7, s63 +; SI-NEXT: s_mov_b32 s5, s73 ; SI-NEXT: s_cbranch_execnz .LBB21_3 ; SI-NEXT: .LBB21_2: ; %cmp.true -; SI-NEXT: s_add_i32 s14, s14, 3 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s58, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s23, 0xffff +; SI-NEXT: s_lshl_b32 s6, s46, 16 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s57, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s21, 0xffff +; SI-NEXT: s_lshl_b32 s8, s45, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s18, 0xffff +; SI-NEXT: s_lshl_b32 s9, s56, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s19, 0xffff +; SI-NEXT: s_lshl_b32 s10, s44, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s14 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s16, 0xffff +; SI-NEXT: s_lshl_b32 s11, s47, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s17, 0xffff +; SI-NEXT: s_lshl_b32 s12, s43, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s12, s24, 0xffff +; SI-NEXT: s_lshl_b32 s13, s42, 16 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s25, s12, 0x30000 +; SI-NEXT: s_lshr_b64 s[12:13], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[4:5], 16 +; SI-NEXT: s_lshr_b32 s43, s11, 16 +; SI-NEXT: s_lshr_b32 s44, s9, 16 +; SI-NEXT: s_lshr_b32 s45, s7, 16 +; SI-NEXT: s_lshr_b32 s46, s5, 16 +; SI-NEXT: s_lshr_b32 s42, s25, 16 ; SI-NEXT: .LBB21_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v0, v0, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v2, v2, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v3, v3, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v4, v4, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v5, v5, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v17 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v1, v1, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v7, v7, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s12, s43, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s12, s14, 16 +; SI-NEXT: s_or_b32 s8, s8, s12 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s12, s44, 16 +; SI-NEXT: s_or_b32 s9, s9, s12 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s12, s26, 16 +; SI-NEXT: s_or_b32 s6, s6, s12 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s12, s45, 16 +; SI-NEXT: s_or_b32 s7, s7, s12 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s12, s28, 16 +; SI-NEXT: s_or_b32 s4, s4, s12 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s12, s46, 16 +; SI-NEXT: s_or_b32 s5, s5, s12 +; SI-NEXT: s_and_b32 s12, s25, 0xffff +; SI-NEXT: s_lshl_b32 s13, s42, 16 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: v_mov_b32_e32 v0, s10 +; SI-NEXT: v_mov_b32_e32 v1, s11 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s6 +; SI-NEXT: v_mov_b32_e32 v5, s7 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: v_mov_b32_e32 v7, s5 +; SI-NEXT: v_mov_b32_e32 v8, s12 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB21_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr25 ; SI-NEXT: s_branch .LBB21_2 ; ; VI-LABEL: bitcast_v18i16_to_v18f16_scalar: @@ -4836,162 +4594,126 @@ define <18 x i16> @bitcast_v18f16_to_v18i16(<18 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v18f16_to_v18i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v10 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v19 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB22_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_or_b32_e32 v11, v11, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v8, v8, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_or_b32_e32 v12, v12, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; SI-NEXT: v_or_b32_e32 v10, v10, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 -; SI-NEXT: v_or_b32_e32 v14, v14, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; SI-NEXT: v_or_b32_e32 v7, v7, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 +; SI-NEXT: v_or_b32_e32 v5, v5, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v13 +; SI-NEXT: v_or_b32_e32 v3, v3, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v14 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v2, v2, v18 +; SI-NEXT: v_or_b32_e32 v1, v1, v18 ; SI-NEXT: v_or_b32_e32 v0, v0, v9 -; SI-NEXT: v_or_b32_e32 v13, v13, v17 +; SI-NEXT: v_or_b32_e32 v2, v2, v17 ; SI-NEXT: v_or_b32_e32 v4, v4, v16 ; SI-NEXT: v_or_b32_e32 v6, v6, v15 -; SI-NEXT: v_alignbit_b32 v18, v2, v9, 16 -; SI-NEXT: v_alignbit_b32 v17, v14, v17, 16 -; SI-NEXT: v_alignbit_b32 v16, v10, v16, 16 -; SI-NEXT: v_alignbit_b32 v15, v12, v15, 16 +; SI-NEXT: v_alignbit_b32 v18, v1, v9, 16 +; SI-NEXT: v_alignbit_b32 v17, v3, v17, 16 +; SI-NEXT: v_alignbit_b32 v16, v5, v16, 16 +; SI-NEXT: v_alignbit_b32 v15, v7, v15, 16 ; SI-NEXT: .LBB22_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v18 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v9 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v14 +; SI-NEXT: v_or_b32_e32 v1, v1, v9 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 ; SI-NEXT: v_or_b32_e32 v2, v2, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v9, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v13 +; SI-NEXT: v_or_b32_e32 v3, v3, v9 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v16 ; SI-NEXT: v_or_b32_e32 v4, v4, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_or_b32_e32 v5, v5, v9 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v15 ; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18f16_to_v18i16: @@ -5101,163 +4823,147 @@ define inreg <18 x i16> @bitcast_v18f16_to_v18i16_scalar(<18 x half> inreg %a, i ; SI-LABEL: bitcast_v18f16_to_v18i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 16 +; SI-NEXT: s_lshr_b32 s10, s22, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s12, s20, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s13, s18, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s14, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s25, 0 -; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: s_cbranch_scc0 .LBB23_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: s_cbranch_execnz .LBB23_4 ; SI-NEXT: .LBB23_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v19 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v17, v8, v0 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v16, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s12 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_or_b32_e32 v19, v8, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_or_b32_e32 v22, v9, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v24, v8, v6 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v15 -; SI-NEXT: v_or_b32_e32 v7, v7, v9 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v18 +; SI-NEXT: v_or_b32_e32 v18, v3, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v9 +; SI-NEXT: v_or_b32_e32 v20, v3, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s11 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_or_b32_e32 v19, v5, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s9 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s23 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v7 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_or_b32_e32 v5, v5, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s21 +; SI-NEXT: v_or_b32_e32 v7, v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s19 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_or_b32_e32 v3, v3, v9 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s24 +; SI-NEXT: v_or_b32_e32 v5, v8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s6 +; SI-NEXT: v_or_b32_e32 v3, v9, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s17 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v20 -; SI-NEXT: v_or_b32_e32 v23, v8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v21 -; SI-NEXT: v_or_b32_e32 v1, v1, v8 -; SI-NEXT: v_lshr_b64 v[12:13], v[0:1], 16 -; SI-NEXT: v_lshr_b64 v[13:14], v[2:3], 16 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 +; SI-NEXT: v_or_b32_e32 v24, v1, v9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 +; SI-NEXT: v_lshr_b64 v[14:15], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[12:13], v[2:3], 16 ; SI-NEXT: v_lshr_b64 v[10:11], v[4:5], 16 ; SI-NEXT: v_lshr_b64 v[8:9], v[6:7], 16 -; SI-NEXT: .LBB23_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v17 +; SI-NEXT: s_branch .LBB23_5 +; SI-NEXT: .LBB23_3: +; SI-NEXT: s_branch .LBB23_2 +; SI-NEXT: .LBB23_4: +; SI-NEXT: v_mov_b32_e32 v17, s11 +; SI-NEXT: v_mov_b32_e32 v21, s9 +; SI-NEXT: v_mov_b32_e32 v22, s8 +; SI-NEXT: v_mov_b32_e32 v23, s7 +; SI-NEXT: v_mov_b32_e32 v25, s6 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v24, s24 +; SI-NEXT: v_mov_b32_e32 v19, s22 +; SI-NEXT: v_mov_b32_e32 v20, s20 +; SI-NEXT: v_mov_b32_e32 v18, s18 +; SI-NEXT: v_mov_b32_e32 v16, s16 +; SI-NEXT: v_mov_b32_e32 v14, s14 +; SI-NEXT: v_mov_b32_e32 v12, s13 +; SI-NEXT: v_mov_b32_e32 v10, s12 +; SI-NEXT: v_mov_b32_e32 v8, s10 +; SI-NEXT: .LBB23_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v16 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v18 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v23 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v20 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v10 ; SI-NEXT: v_or_b32_e32 v4, v4, v6 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v22 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v19 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v6, v6, v8 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v21 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v20 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB23_4: -; SI-NEXT: s_branch .LBB23_2 ; ; VI-LABEL: bitcast_v18f16_to_v18i16_scalar: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll index 20a8e6dc2727e..bb0e13e2997e7 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll @@ -1419,156 +1419,86 @@ define <20 x half> @bitcast_v10i32_to_v20f16(<10 x i32> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB8_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: v_alignbit_b32 v10, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v11, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v12, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v14, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v16, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 ; SI-NEXT: .LBB8_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB8_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_alignbit_b32 v10, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v11, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v12, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v14, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v16, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 ; SI-NEXT: .LBB8_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v25 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v26 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v21 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v18 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v17 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v12 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v14 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v18 +; SI-NEXT: v_or_b32_e32 v4, v4, v12 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v17 +; SI-NEXT: v_or_b32_e32 v6, v6, v11 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; SI-NEXT: v_or_b32_e32 v1, v1, v16 +; SI-NEXT: v_or_b32_e32 v3, v3, v14 +; SI-NEXT: v_or_b32_e32 v5, v5, v12 +; SI-NEXT: v_or_b32_e32 v7, v7, v11 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10i32_to_v20f16: @@ -1664,141 +1594,91 @@ define inreg <20 x half> @bitcast_v10i32_to_v20f16_scalar(<10 x i32> inreg %a, i ; SI-NEXT: s_cmp_lg_u32 s26, 0 ; SI-NEXT: s_cbranch_scc0 .LBB9_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: s_lshr_b32 s26, s25, 16 +; SI-NEXT: s_lshr_b32 s27, s23, 16 +; SI-NEXT: s_lshr_b32 s28, s21, 16 +; SI-NEXT: s_lshr_b32 s29, s19, 16 +; SI-NEXT: s_lshr_b32 s40, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB9_3 ; SI-NEXT: .LBB9_2: ; %cmp.true -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 ; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s26, s25, 16 +; SI-NEXT: s_lshr_b32 s27, s23, 16 +; SI-NEXT: s_lshr_b32 s28, s21, 16 +; SI-NEXT: s_lshr_b32 s29, s19, 16 +; SI-NEXT: s_lshr_b32 s40, s17, 16 ; SI-NEXT: .LBB9_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v4, v14, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v13 -; SI-NEXT: v_or_b32_e32 v6, v12, v6 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v7, v7, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 -; SI-NEXT: v_or_b32_e32 v1, v20, v1 -; SI-NEXT: v_or_b32_e32 v3, v18, v3 -; SI-NEXT: v_or_b32_e32 v5, v16, v5 -; SI-NEXT: v_or_b32_e32 v8, v11, v8 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s12, 16 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s9, s40, 16 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s18, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s19, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s20, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_or_b32 s8, s11, s8 +; SI-NEXT: s_and_b32 s11, s21, 0xffff +; SI-NEXT: s_lshl_b32 s12, s28, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s12, s22, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s12, s6 +; SI-NEXT: s_and_b32 s12, s23, 0xffff +; SI-NEXT: s_lshl_b32 s13, s27, 16 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s24, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s13, s4 +; SI-NEXT: s_and_b32 s13, s25, 0xffff +; SI-NEXT: s_lshl_b32 s14, s26, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s10 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_mov_b32_e32 v6, s6 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: v_mov_b32_e32 v9, s13 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB9_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: s_branch .LBB9_2 ; ; VI-LABEL: bitcast_v10i32_to_v20f16_scalar: @@ -1918,56 +1798,26 @@ define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v20f16_to_v10i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v9 +; SI-NEXT: v_mov_b32_e32 v20, v9 +; SI-NEXT: v_mov_b32_e32 v11, v8 +; SI-NEXT: v_mov_b32_e32 v12, v7 +; SI-NEXT: v_mov_b32_e32 v13, v6 +; SI-NEXT: v_mov_b32_e32 v14, v5 +; SI-NEXT: v_mov_b32_e32 v15, v4 +; SI-NEXT: v_mov_b32_e32 v16, v3 +; SI-NEXT: v_mov_b32_e32 v17, v2 +; SI-NEXT: v_mov_b32_e32 v18, v1 +; SI-NEXT: v_mov_b32_e32 v19, v0 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v19 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -1980,26 +1830,46 @@ define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB10_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 -; SI-NEXT: v_or_b32_e32 v1, v27, v1 -; SI-NEXT: v_or_b32_e32 v2, v25, v2 -; SI-NEXT: v_or_b32_e32 v3, v23, v3 -; SI-NEXT: v_or_b32_e32 v4, v21, v4 -; SI-NEXT: v_or_b32_e32 v5, v19, v5 -; SI-NEXT: v_or_b32_e32 v6, v17, v6 -; SI-NEXT: v_or_b32_e32 v7, v15, v7 -; SI-NEXT: v_or_b32_e32 v8, v13, v8 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v28 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v27 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v26 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v25 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v23 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v22 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v21 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr28 @@ -2010,23 +1880,13 @@ define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB10_2 ; SI-NEXT: .LBB10_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v18 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -2035,25 +1895,25 @@ define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v28 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v17 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v16 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -2061,11 +1921,11 @@ define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -2073,11 +1933,11 @@ define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v13 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v12 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -2085,12 +1945,12 @@ define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v20 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -2218,85 +2078,55 @@ define inreg <10 x i32> @bitcast_v20f16_to_v10i32_scalar(<20 x half> inreg %a, i ; SI-LABEL: bitcast_v20f16_to_v10i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_lshr_b32 s27, s25, 16 +; SI-NEXT: s_lshr_b32 s28, s24, 16 +; SI-NEXT: s_lshr_b32 s29, s23, 16 +; SI-NEXT: s_lshr_b32 s40, s22, 16 +; SI-NEXT: s_lshr_b32 s41, s21, 16 +; SI-NEXT: s_lshr_b32 s42, s20, 16 +; SI-NEXT: s_lshr_b32 s43, s19, 16 +; SI-NEXT: s_lshr_b32 s44, s18, 16 +; SI-NEXT: s_lshr_b32 s45, s17, 16 +; SI-NEXT: s_lshr_b32 s46, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s26, 0 -; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: s_cbranch_scc0 .LBB11_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 -; SI-NEXT: v_or_b32_e32 v0, v28, v0 -; SI-NEXT: v_or_b32_e32 v1, v26, v1 -; SI-NEXT: v_or_b32_e32 v2, v24, v2 -; SI-NEXT: v_or_b32_e32 v3, v22, v3 -; SI-NEXT: v_or_b32_e32 v4, v20, v4 -; SI-NEXT: v_or_b32_e32 v5, v18, v5 -; SI-NEXT: v_or_b32_e32 v6, v16, v6 -; SI-NEXT: v_or_b32_e32 v7, v14, v7 -; SI-NEXT: v_or_b32_e32 v8, v12, v8 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s46, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s45, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s44, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s43, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s42, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s41, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s40, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s29, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s12, s24, 0xffff +; SI-NEXT: s_lshl_b32 s13, s28, 16 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s25, 0xffff +; SI-NEXT: s_lshl_b32 s14, s27, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_cbranch_execnz .LBB11_4 ; SI-NEXT: .LBB11_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -2305,25 +2135,25 @@ define inreg <10 x i32> @bitcast_v20f16_to_v10i32_scalar(<20 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s44 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s43 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -2331,11 +2161,11 @@ define inreg <10 x i32> @bitcast_v20f16_to_v10i32_scalar(<20 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s41 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -2343,11 +2173,11 @@ define inreg <10 x i32> @bitcast_v20f16_to_v10i32_scalar(<20 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s22 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -2355,29 +2185,40 @@ define inreg <10 x i32> @bitcast_v20f16_to_v10i32_scalar(<20 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s28 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB11_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; SI-NEXT: .LBB11_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; SI-NEXT: s_branch .LBB11_2 +; SI-NEXT: .LBB11_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v8, s12 +; SI-NEXT: v_mov_b32_e32 v9, s13 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20f16_to_v10i32_scalar: ; VI: ; %bb.0: @@ -8470,171 +8311,101 @@ define <20 x half> @bitcast_v10f32_to_v20f16(<10 x float> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB28_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: v_alignbit_b32 v10, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v11, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v12, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v14, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v16, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 ; SI-NEXT: .LBB28_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB28_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_alignbit_b32 v10, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v11, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v12, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v14, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v16, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 ; SI-NEXT: .LBB28_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v25 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v26 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v21 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v18 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v17 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v12 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v10f32_to_v20f16: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB28_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v14 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v18 +; SI-NEXT: v_or_b32_e32 v4, v4, v12 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v17 +; SI-NEXT: v_or_b32_e32 v6, v6, v11 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; SI-NEXT: v_or_b32_e32 v1, v1, v16 +; SI-NEXT: v_or_b32_e32 v3, v3, v14 +; SI-NEXT: v_or_b32_e32 v5, v5, v12 +; SI-NEXT: v_or_b32_e32 v7, v7, v11 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v10f32_to_v20f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB28_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 ; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 @@ -8707,144 +8478,106 @@ define inreg <20 x half> @bitcast_v10f32_to_v20f16_scalar(<10 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s26, 0 -; SI-NEXT: s_cbranch_scc0 .LBB29_4 +; SI-NEXT: s_cbranch_scc0 .LBB29_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 -; SI-NEXT: s_cbranch_execnz .LBB29_3 +; SI-NEXT: s_lshr_b32 s40, s25, 16 +; SI-NEXT: s_lshr_b32 s29, s23, 16 +; SI-NEXT: s_lshr_b32 s28, s21, 16 +; SI-NEXT: s_lshr_b32 s27, s19, 16 +; SI-NEXT: s_lshr_b32 s26, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[16:17], 16 +; SI-NEXT: s_cbranch_execnz .LBB29_4 ; SI-NEXT: .LBB29_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v19, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v18, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v17, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v16, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v15, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_lshr_b64 v[10:11], v[8:9], 16 ; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v13, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v6, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v10, s24, 1.0 -; SI-NEXT: v_add_f32_e64 v8, s25, 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: .LBB29_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v4, v14, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v13 -; SI-NEXT: v_or_b32_e32 v6, v12, v6 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v7, v7, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 -; SI-NEXT: v_or_b32_e32 v1, v20, v1 -; SI-NEXT: v_or_b32_e32 v3, v18, v3 -; SI-NEXT: v_or_b32_e32 v5, v16, v5 -; SI-NEXT: v_or_b32_e32 v8, v11, v8 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_lshr_b64 v[11:12], v[6:7], 16 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_lshr_b64 v[12:13], v[4:5], 16 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_lshr_b64 v[13:14], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v1 +; SI-NEXT: s_branch .LBB29_5 +; SI-NEXT: .LBB29_3: +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: s_branch .LBB29_2 +; SI-NEXT: .LBB29_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v19, s27 +; SI-NEXT: v_mov_b32_e32 v18, s28 +; SI-NEXT: v_mov_b32_e32 v17, s29 +; SI-NEXT: v_mov_b32_e32 v16, s40 +; SI-NEXT: v_mov_b32_e32 v14, s12 +; SI-NEXT: v_mov_b32_e32 v13, s10 +; SI-NEXT: v_mov_b32_e32 v12, s8 +; SI-NEXT: v_mov_b32_e32 v11, s6 +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: .LBB29_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 +; SI-NEXT: v_or_b32_e32 v2, v2, v13 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v19 +; SI-NEXT: v_or_b32_e32 v4, v4, v12 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v18 +; SI-NEXT: v_or_b32_e32 v6, v6, v11 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v14 +; SI-NEXT: v_or_b32_e32 v3, v3, v13 +; SI-NEXT: v_or_b32_e32 v5, v5, v12 +; SI-NEXT: v_or_b32_e32 v7, v7, v11 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB29_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: s_branch .LBB29_2 ; ; VI-LABEL: bitcast_v10f32_to_v20f16_scalar: ; VI: ; %bb.0: @@ -8984,56 +8717,26 @@ define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v20f16_to_v10f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v9 +; SI-NEXT: v_mov_b32_e32 v20, v9 +; SI-NEXT: v_mov_b32_e32 v11, v8 +; SI-NEXT: v_mov_b32_e32 v12, v7 +; SI-NEXT: v_mov_b32_e32 v13, v6 +; SI-NEXT: v_mov_b32_e32 v14, v5 +; SI-NEXT: v_mov_b32_e32 v15, v4 +; SI-NEXT: v_mov_b32_e32 v16, v3 +; SI-NEXT: v_mov_b32_e32 v17, v2 +; SI-NEXT: v_mov_b32_e32 v18, v1 +; SI-NEXT: v_mov_b32_e32 v19, v0 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v19 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -9046,54 +8749,64 @@ define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB30_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 -; SI-NEXT: v_or_b32_e32 v1, v27, v1 -; SI-NEXT: v_or_b32_e32 v2, v25, v2 -; SI-NEXT: v_or_b32_e32 v3, v23, v3 -; SI-NEXT: v_or_b32_e32 v4, v21, v4 -; SI-NEXT: v_or_b32_e32 v5, v19, v5 -; SI-NEXT: v_or_b32_e32 v6, v17, v6 -; SI-NEXT: v_or_b32_e32 v7, v15, v7 -; SI-NEXT: v_or_b32_e32 v8, v13, v8 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB30_2 -; SI-NEXT: .LBB30_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v27 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v28 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v27 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v26 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v25 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v23 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v22 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v21 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_2 +; SI-NEXT: .LBB30_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v18 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 @@ -9101,25 +8814,25 @@ define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v28 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v17 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v16 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -9127,11 +8840,11 @@ define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -9139,11 +8852,11 @@ define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v13 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v12 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -9151,12 +8864,12 @@ define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v20 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -9284,85 +8997,55 @@ define inreg <10 x float> @bitcast_v20f16_to_v10f32_scalar(<20 x half> inreg %a, ; SI-LABEL: bitcast_v20f16_to_v10f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_lshr_b32 s27, s25, 16 +; SI-NEXT: s_lshr_b32 s28, s24, 16 +; SI-NEXT: s_lshr_b32 s29, s23, 16 +; SI-NEXT: s_lshr_b32 s40, s22, 16 +; SI-NEXT: s_lshr_b32 s41, s21, 16 +; SI-NEXT: s_lshr_b32 s42, s20, 16 +; SI-NEXT: s_lshr_b32 s43, s19, 16 +; SI-NEXT: s_lshr_b32 s44, s18, 16 +; SI-NEXT: s_lshr_b32 s45, s17, 16 +; SI-NEXT: s_lshr_b32 s46, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s26, 0 -; SI-NEXT: s_cbranch_scc0 .LBB31_4 +; SI-NEXT: s_cbranch_scc0 .LBB31_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 -; SI-NEXT: v_or_b32_e32 v0, v28, v0 -; SI-NEXT: v_or_b32_e32 v1, v26, v1 -; SI-NEXT: v_or_b32_e32 v2, v24, v2 -; SI-NEXT: v_or_b32_e32 v3, v22, v3 -; SI-NEXT: v_or_b32_e32 v4, v20, v4 -; SI-NEXT: v_or_b32_e32 v5, v18, v5 -; SI-NEXT: v_or_b32_e32 v6, v16, v6 -; SI-NEXT: v_or_b32_e32 v7, v14, v7 -; SI-NEXT: v_or_b32_e32 v8, v12, v8 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: s_cbranch_execnz .LBB31_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s46, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s45, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s44, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s43, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s42, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s41, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s40, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s29, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s12, s24, 0xffff +; SI-NEXT: s_lshl_b32 s13, s28, 16 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s25, 0xffff +; SI-NEXT: s_lshl_b32 s14, s27, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_cbranch_execnz .LBB31_4 ; SI-NEXT: .LBB31_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -9371,25 +9054,25 @@ define inreg <10 x float> @bitcast_v20f16_to_v10f32_scalar(<20 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s44 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s43 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -9397,11 +9080,11 @@ define inreg <10 x float> @bitcast_v20f16_to_v10f32_scalar(<20 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s41 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -9409,11 +9092,11 @@ define inreg <10 x float> @bitcast_v20f16_to_v10f32_scalar(<20 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s22 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -9421,29 +9104,40 @@ define inreg <10 x float> @bitcast_v20f16_to_v10f32_scalar(<20 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s28 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: .LBB31_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB31_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; SI-NEXT: .LBB31_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; SI-NEXT: s_branch .LBB31_2 +; SI-NEXT: .LBB31_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v8, s12 +; SI-NEXT: v_mov_b32_e32 v9, s13 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20f16_to_v10f32_scalar: ; VI: ; %bb.0: @@ -14680,61 +14374,71 @@ define <20 x half> @bitcast_v20i16_to_v20f16(<20 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v20i16_to_v20f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v14 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v10, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v39 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v22, v1, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v20, v1, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v29, v0, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v17, v1, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v24, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v14, v1, v37 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v21, v0, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v10, v1, v39 +; SI-NEXT: v_or_b32_e32 v19, v0, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_alignbit_b32 v23, v22, v30, 16 +; SI-NEXT: v_alignbit_b32 v25, v20, v32, 16 +; SI-NEXT: v_alignbit_b32 v26, v17, v34, 16 +; SI-NEXT: v_alignbit_b32 v27, v14, v36, 16 +; SI-NEXT: v_alignbit_b32 v28, v10, v38, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v38 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -14745,102 +14449,103 @@ define <20 x half> @bitcast_v20i16_to_v20f16(<20 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: .LBB44_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v38 ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v36 ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v34 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v32 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v30 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v6, v36, v6 +; SI-NEXT: v_or_b32_e32 v4, v34, v4 +; SI-NEXT: v_or_b32_e32 v2, v32, v2 +; SI-NEXT: v_or_b32_e32 v0, v30, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v8, v39, v8 +; SI-NEXT: v_or_b32_e32 v6, v37, v6 +; SI-NEXT: v_or_b32_e32 v4, v35, v4 +; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v0 +; SI-NEXT: v_alignbit_b32 v23, v22, v29, 16 +; SI-NEXT: v_alignbit_b32 v25, v20, v24, 16 +; SI-NEXT: v_alignbit_b32 v26, v17, v21, 16 +; SI-NEXT: v_alignbit_b32 v27, v14, v19, 16 +; SI-NEXT: v_alignbit_b32 v28, v10, v15, 16 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 ; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v12 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v20 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v23 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v16 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v25 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v19 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v27 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v21 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v23 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v25 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v16 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v26 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v13 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v27 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v28 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20i16_to_v20f16: @@ -14954,144 +14659,174 @@ define inreg <20 x half> @bitcast_v20i16_to_v20f16_scalar(<20 x i16> inreg %a, i ; SI-LABEL: bitcast_v20i16_to_v20f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s15, s25, 16 -; SI-NEXT: s_lshr_b32 s14, s24, 16 -; SI-NEXT: s_lshr_b32 s13, s23, 16 -; SI-NEXT: s_lshr_b32 s12, s22, 16 -; SI-NEXT: s_lshr_b32 s11, s21, 16 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_lshr_b32 s58, s25, 16 +; SI-NEXT: s_lshr_b32 s63, s24, 16 +; SI-NEXT: s_lshr_b32 s57, s23, 16 +; SI-NEXT: s_lshr_b32 s62, s22, 16 +; SI-NEXT: s_lshr_b32 s56, s21, 16 +; SI-NEXT: s_lshr_b32 s61, s20, 16 +; SI-NEXT: s_lshr_b32 s47, s19, 16 +; SI-NEXT: s_lshr_b32 s60, s18, 16 +; SI-NEXT: s_lshr_b32 s46, s17, 16 +; SI-NEXT: s_lshr_b32 s59, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s26, 0 ; SI-NEXT: s_cbranch_scc0 .LBB45_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s46, 16 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s28, s59, 16 +; SI-NEXT: s_or_b32 s29, s5, s7 +; SI-NEXT: s_and_b32 s5, s19, 0xffff +; SI-NEXT: s_lshl_b32 s7, s47, 16 +; SI-NEXT: s_or_b32 s12, s4, s28 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s40, s60, 16 +; SI-NEXT: s_or_b32 s41, s5, s7 +; SI-NEXT: s_and_b32 s5, s21, 0xffff +; SI-NEXT: s_lshl_b32 s7, s56, 16 +; SI-NEXT: s_or_b32 s10, s4, s40 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s42, s61, 16 +; SI-NEXT: s_or_b32 s43, s5, s7 +; SI-NEXT: s_and_b32 s5, s23, 0xffff +; SI-NEXT: s_lshl_b32 s7, s57, 16 +; SI-NEXT: s_or_b32 s8, s4, s42 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s44, s62, 16 +; SI-NEXT: s_or_b32 s45, s5, s7 +; SI-NEXT: s_and_b32 s5, s25, 0xffff +; SI-NEXT: s_lshl_b32 s7, s58, 16 +; SI-NEXT: s_or_b32 s6, s4, s44 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s72, s63, 16 +; SI-NEXT: s_or_b32 s73, s5, s7 +; SI-NEXT: s_or_b32 s4, s4, s72 +; SI-NEXT: s_lshr_b64 s[14:15], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[40:41], 16 +; SI-NEXT: s_mov_b32 s13, s29 +; SI-NEXT: s_lshr_b64 s[28:29], s[42:43], 16 +; SI-NEXT: s_mov_b32 s11, s41 +; SI-NEXT: s_lshr_b64 s[40:41], s[44:45], 16 +; SI-NEXT: s_mov_b32 s9, s43 +; SI-NEXT: s_lshr_b64 s[42:43], s[72:73], 16 +; SI-NEXT: s_mov_b32 s7, s45 +; SI-NEXT: s_mov_b32 s5, s73 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s14, s14, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s63, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s25, 0xffff +; SI-NEXT: s_lshl_b32 s6, s58, 16 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s10, s10, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s22, 0xffff +; SI-NEXT: s_lshl_b32 s7, s62, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s23, 0xffff +; SI-NEXT: s_lshl_b32 s8, s57, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s61, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s56, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s18, 0xffff +; SI-NEXT: s_lshl_b32 s11, s60, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s19, 0xffff +; SI-NEXT: s_lshl_b32 s12, s47, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s12, s16, 0xffff +; SI-NEXT: s_lshl_b32 s13, s59, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s17, 0xffff +; SI-NEXT: s_lshl_b32 s14, s46, 16 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_lshr_b64 s[14:15], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[4:5], 16 +; SI-NEXT: s_lshr_b32 s46, s13, 16 +; SI-NEXT: s_lshr_b32 s47, s11, 16 +; SI-NEXT: s_lshr_b32 s56, s9, 16 +; SI-NEXT: s_lshr_b32 s57, s7, 16 +; SI-NEXT: s_lshr_b32 s58, s5, 16 ; SI-NEXT: .LBB45_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v0, v0, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 -; SI-NEXT: v_or_b32_e32 v1, v1, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v2, v2, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 -; SI-NEXT: v_or_b32_e32 v3, v3, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v4, v4, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 -; SI-NEXT: v_or_b32_e32 v5, v5, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_or_b32 s12, s12, s14 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s14, s46, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s14, s26, 16 +; SI-NEXT: s_or_b32 s10, s10, s14 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s14, s47, 16 +; SI-NEXT: s_or_b32 s11, s11, s14 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s14, s28, 16 +; SI-NEXT: s_or_b32 s8, s8, s14 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s14, s56, 16 +; SI-NEXT: s_or_b32 s9, s9, s14 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s14, s40, 16 +; SI-NEXT: s_or_b32 s6, s6, s14 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s14, s57, 16 +; SI-NEXT: s_or_b32 s7, s7, s14 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s14, s42, 16 +; SI-NEXT: s_or_b32 s4, s4, s14 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s14, s58, 16 +; SI-NEXT: s_or_b32 s5, s5, s14 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: v_mov_b32_e32 v9, s5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr42 ; SI-NEXT: s_branch .LBB45_2 ; ; VI-LABEL: bitcast_v20i16_to_v20f16_scalar: @@ -15265,178 +15000,138 @@ define <20 x i16> @bitcast_v20f16_to_v20i16(<20 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v20f16_to_v20i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v20 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v10 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_or_b32_e32 v9, v9, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_or_b32_e32 v7, v7, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_or_b32_e32 v5, v5, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v15 ; SI-NEXT: v_or_b32_e32 v3, v3, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v17 ; SI-NEXT: v_or_b32_e32 v1, v1, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v0, v0, v19 -; SI-NEXT: v_or_b32_e32 v15, v15, v18 -; SI-NEXT: v_or_b32_e32 v12, v12, v17 -; SI-NEXT: v_or_b32_e32 v13, v13, v16 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_alignbit_b32 v19, v1, v19, 16 -; SI-NEXT: v_alignbit_b32 v18, v3, v18, 16 -; SI-NEXT: v_alignbit_b32 v17, v5, v17, 16 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v0, v0, v10 +; SI-NEXT: v_or_b32_e32 v2, v2, v19 +; SI-NEXT: v_or_b32_e32 v4, v4, v18 +; SI-NEXT: v_or_b32_e32 v6, v6, v16 +; SI-NEXT: v_or_b32_e32 v8, v8, v13 +; SI-NEXT: v_alignbit_b32 v10, v1, v10, 16 +; SI-NEXT: v_alignbit_b32 v19, v3, v19, 16 +; SI-NEXT: v_alignbit_b32 v18, v5, v18, 16 ; SI-NEXT: v_alignbit_b32 v16, v7, v16, 16 -; SI-NEXT: v_alignbit_b32 v14, v9, v14, 16 +; SI-NEXT: v_alignbit_b32 v13, v9, v13, 16 ; SI-NEXT: .LBB46_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v10 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v17 +; SI-NEXT: v_or_b32_e32 v1, v1, v10 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v10 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v15 +; SI-NEXT: v_or_b32_e32 v3, v3, v10 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v18 +; SI-NEXT: v_or_b32_e32 v4, v4, v10 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v14 +; SI-NEXT: v_or_b32_e32 v5, v5, v10 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v16 +; SI-NEXT: v_or_b32_e32 v6, v6, v10 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v18 -; SI-NEXT: v_or_b32_e32 v4, v4, v12 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v16 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v0, v0, v19 -; SI-NEXT: v_or_b32_e32 v2, v2, v15 -; SI-NEXT: v_or_b32_e32 v6, v6, v12 -; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -15552,149 +15247,133 @@ define inreg <20 x i16> @bitcast_v20f16_to_v20i16_scalar(<20 x half> inreg %a, i ; SI-LABEL: bitcast_v20f16_to_v20i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v11 +; SI-NEXT: s_lshr_b32 s11, s25, 16 +; SI-NEXT: s_lshr_b32 s10, s24, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 16 +; SI-NEXT: s_lshr_b32 s12, s22, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s20, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s14, s18, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s15, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s26, 0 -; SI-NEXT: s_cbranch_scc0 .LBB47_4 +; SI-NEXT: s_cbranch_scc0 .LBB47_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: s_cbranch_execnz .LBB47_4 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v8, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v25 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v20 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v27 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v20, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s19 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s10 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s9 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s23 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s14 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_or_b32_e32 v7, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v23 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v22 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v29 -; SI-NEXT: v_or_b32_e32 v27, v10, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v24 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v5, v5, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 +; SI-NEXT: v_or_b32_e32 v3, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v21 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v9 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v25 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_or_b32_e32 v9, v9, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v29, v11, v2 -; SI-NEXT: v_or_b32_e32 v28, v10, v4 -; SI-NEXT: v_or_b32_e32 v26, v12, v6 -; SI-NEXT: v_or_b32_e32 v24, v13, v8 +; SI-NEXT: v_or_b32_e32 v29, v10, v2 +; SI-NEXT: v_or_b32_e32 v28, v11, v4 +; SI-NEXT: v_or_b32_e32 v27, v12, v6 +; SI-NEXT: v_or_b32_e32 v26, v13, v8 ; SI-NEXT: v_lshr_b64 v[18:19], v[0:1], 16 ; SI-NEXT: v_lshr_b64 v[16:17], v[2:3], 16 ; SI-NEXT: v_lshr_b64 v[14:15], v[4:5], 16 ; SI-NEXT: v_lshr_b64 v[12:13], v[6:7], 16 ; SI-NEXT: v_lshr_b64 v[10:11], v[8:9], 16 -; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: s_branch .LBB47_5 +; SI-NEXT: .LBB47_3: +; SI-NEXT: s_branch .LBB47_2 +; SI-NEXT: .LBB47_4: +; SI-NEXT: v_mov_b32_e32 v21, s11 +; SI-NEXT: v_mov_b32_e32 v22, s9 +; SI-NEXT: v_mov_b32_e32 v23, s8 +; SI-NEXT: v_mov_b32_e32 v24, s7 +; SI-NEXT: v_mov_b32_e32 v25, s6 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v26, s24 +; SI-NEXT: v_mov_b32_e32 v27, s22 +; SI-NEXT: v_mov_b32_e32 v28, s20 +; SI-NEXT: v_mov_b32_e32 v29, s18 +; SI-NEXT: v_mov_b32_e32 v20, s16 +; SI-NEXT: v_mov_b32_e32 v18, s15 +; SI-NEXT: v_mov_b32_e32 v16, s14 +; SI-NEXT: v_mov_b32_e32 v14, s13 +; SI-NEXT: v_mov_b32_e32 v12, s12 +; SI-NEXT: v_mov_b32_e32 v10, s10 +; SI-NEXT: .LBB47_5: ; %end ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v20 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 @@ -15703,29 +15382,27 @@ define inreg <20 x i16> @bitcast_v20f16_to_v20i16_scalar(<20 x half> inreg %a, i ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v29 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v24 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v14 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v28 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v23 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v27 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v12 ; SI-NEXT: v_or_b32_e32 v6, v6, v8 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v22 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v26 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v8, v8, v10 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v21 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB47_4: -; SI-NEXT: s_branch .LBB47_2 ; ; VI-LABEL: bitcast_v20f16_to_v20i16_scalar: ; VI: ; %bb.0: @@ -22456,36 +22133,6 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v20f16_to_v40i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill @@ -22494,356 +22141,354 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v51, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v23 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v41, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v6 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v44, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v8 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v10 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB60_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 -; SI-NEXT: v_or_b32_e32 v24, v50, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_or_b32_e32 v20, v49, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v54 -; SI-NEXT: v_or_b32_e32 v13, v53, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; SI-NEXT: v_or_b32_e32 v12, v52, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v41 -; SI-NEXT: v_or_b32_e32 v10, v40, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_or_b32_e32 v11, v55, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v44 -; SI-NEXT: v_or_b32_e32 v8, v43, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; SI-NEXT: v_or_b32_e32 v9, v42, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v47 -; SI-NEXT: v_or_b32_e32 v7, v46, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; SI-NEXT: v_or_b32_e32 v6, v45, v6 -; SI-NEXT: v_alignbit_b32 v26, v20, v24, 24 -; SI-NEXT: v_alignbit_b32 v30, v20, v24, 16 -; SI-NEXT: v_alignbit_b32 v32, v20, v24, 8 -; SI-NEXT: v_alignbit_b32 v25, v12, v13, 24 -; SI-NEXT: v_alignbit_b32 v27, v12, v13, 16 -; SI-NEXT: v_alignbit_b32 v31, v12, v13, 8 -; SI-NEXT: v_alignbit_b32 v19, v11, v10, 24 -; SI-NEXT: v_alignbit_b32 v22, v11, v10, 16 -; SI-NEXT: v_alignbit_b32 v28, v11, v10, 8 -; SI-NEXT: v_alignbit_b32 v16, v9, v8, 24 -; SI-NEXT: v_alignbit_b32 v17, v9, v8, 16 -; SI-NEXT: v_alignbit_b32 v23, v9, v8, 8 -; SI-NEXT: v_alignbit_b32 v14, v6, v7, 24 -; SI-NEXT: v_alignbit_b32 v15, v6, v7, 16 -; SI-NEXT: v_alignbit_b32 v18, v6, v7, 8 -; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v20 -; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v12 -; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v11 -; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v9 -; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v6 -; SI-NEXT: v_bfe_u32 v48, v5, 8, 8 -; SI-NEXT: v_bfe_u32 v38, v4, 8, 8 -; SI-NEXT: v_bfe_u32 v36, v3, 8, 8 -; SI-NEXT: v_bfe_u32 v34, v2, 8, 8 -; SI-NEXT: v_bfe_u32 v29, v1, 8, 8 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 +; SI-NEXT: v_or_b32_e32 v27, v1, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; SI-NEXT: v_or_b32_e32 v25, v1, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_or_b32_e32 v22, v1, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v15 +; SI-NEXT: v_or_b32_e32 v23, v1, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 +; SI-NEXT: v_or_b32_e32 v20, v1, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_or_b32_e32 v21, v1, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_or_b32_e32 v18, v1, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 +; SI-NEXT: v_or_b32_e32 v19, v1, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_or_b32_e32 v11, v1, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; SI-NEXT: v_or_b32_e32 v17, v1, v2 +; SI-NEXT: v_alignbit_b32 v32, v25, v27, 24 +; SI-NEXT: v_alignbit_b32 v37, v25, v27, 16 +; SI-NEXT: v_alignbit_b32 v49, v25, v27, 8 +; SI-NEXT: v_alignbit_b32 v31, v23, v22, 24 +; SI-NEXT: v_alignbit_b32 v35, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v48, v23, v22, 8 +; SI-NEXT: v_alignbit_b32 v29, v21, v20, 24 +; SI-NEXT: v_alignbit_b32 v33, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v39, v21, v20, 8 +; SI-NEXT: v_alignbit_b32 v26, v19, v18, 24 +; SI-NEXT: v_alignbit_b32 v30, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v36, v19, v18, 8 +; SI-NEXT: v_alignbit_b32 v24, v17, v11, 24 +; SI-NEXT: v_alignbit_b32 v28, v17, v11, 16 +; SI-NEXT: v_alignbit_b32 v34, v17, v11, 8 +; SI-NEXT: v_lshrrev_b32_e32 v41, 8, v25 +; SI-NEXT: v_lshrrev_b32_e32 v55, 8, v23 +; SI-NEXT: v_lshrrev_b32_e32 v53, 8, v21 +; SI-NEXT: v_lshrrev_b32_e32 v51, 8, v19 +; SI-NEXT: v_lshrrev_b32_e32 v38, 8, v17 +; SI-NEXT: v_bfe_u32 v42, v16, 8, 8 +; SI-NEXT: v_bfe_u32 v40, v15, 8, 8 +; SI-NEXT: v_bfe_u32 v54, v14, 8, 8 +; SI-NEXT: v_bfe_u32 v52, v13, 8, 8 +; SI-NEXT: v_bfe_u32 v50, v12, 8, 8 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: .LBB60_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB60_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v6, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v45 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v11, v9, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v46 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_or_b32_e32 v17, v10, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_or_b32_e32 v18, v7, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v10 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v7, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v44 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v40 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v19, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v14 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_or_b32_e32 v20, v5, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v55 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v21, v5, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v53 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v11, v11, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v52 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v6 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v13, v13, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v51 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v4 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v24, v14, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; SI-NEXT: v_or_b32_e32 v20, v16, v14 -; SI-NEXT: v_alignbit_b32 v26, v20, v24, 24 -; SI-NEXT: v_alignbit_b32 v30, v20, v24, 16 -; SI-NEXT: v_alignbit_b32 v32, v20, v24, 8 -; SI-NEXT: v_alignbit_b32 v25, v12, v13, 24 -; SI-NEXT: v_alignbit_b32 v27, v12, v13, 16 -; SI-NEXT: v_alignbit_b32 v31, v12, v13, 8 -; SI-NEXT: v_alignbit_b32 v19, v11, v10, 24 -; SI-NEXT: v_alignbit_b32 v22, v11, v10, 16 -; SI-NEXT: v_alignbit_b32 v28, v11, v10, 8 -; SI-NEXT: v_alignbit_b32 v16, v9, v8, 24 -; SI-NEXT: v_alignbit_b32 v17, v9, v8, 16 -; SI-NEXT: v_alignbit_b32 v23, v9, v8, 8 -; SI-NEXT: v_alignbit_b32 v14, v6, v7, 24 -; SI-NEXT: v_alignbit_b32 v15, v6, v7, 16 -; SI-NEXT: v_alignbit_b32 v18, v6, v7, 8 -; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v20 -; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v12 -; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v11 -; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v9 -; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v6 -; SI-NEXT: v_bfe_u32 v48, v5, 8, 8 -; SI-NEXT: v_bfe_u32 v38, v4, 8, 8 -; SI-NEXT: v_bfe_u32 v36, v3, 8, 8 -; SI-NEXT: v_bfe_u32 v34, v2, 8, 8 -; SI-NEXT: v_bfe_u32 v29, v1, 8, 8 +; SI-NEXT: v_or_b32_e32 v22, v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v15 +; SI-NEXT: v_or_b32_e32 v23, v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v27, v1, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; SI-NEXT: v_or_b32_e32 v25, v2, v1 +; SI-NEXT: v_alignbit_b32 v32, v25, v27, 24 +; SI-NEXT: v_alignbit_b32 v37, v25, v27, 16 +; SI-NEXT: v_alignbit_b32 v49, v25, v27, 8 +; SI-NEXT: v_alignbit_b32 v31, v23, v22, 24 +; SI-NEXT: v_alignbit_b32 v35, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v48, v23, v22, 8 +; SI-NEXT: v_alignbit_b32 v29, v21, v20, 24 +; SI-NEXT: v_alignbit_b32 v33, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v39, v21, v20, 8 +; SI-NEXT: v_alignbit_b32 v26, v19, v18, 24 +; SI-NEXT: v_alignbit_b32 v30, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v36, v19, v18, 8 +; SI-NEXT: v_alignbit_b32 v24, v17, v11, 24 +; SI-NEXT: v_alignbit_b32 v28, v17, v11, 16 +; SI-NEXT: v_alignbit_b32 v34, v17, v11, 8 +; SI-NEXT: v_lshrrev_b32_e32 v41, 8, v25 +; SI-NEXT: v_lshrrev_b32_e32 v55, 8, v23 +; SI-NEXT: v_lshrrev_b32_e32 v53, 8, v21 +; SI-NEXT: v_lshrrev_b32_e32 v51, 8, v19 +; SI-NEXT: v_lshrrev_b32_e32 v38, 8, v17 +; SI-NEXT: v_bfe_u32 v42, v16, 8, 8 +; SI-NEXT: v_bfe_u32 v40, v15, 8, 8 +; SI-NEXT: v_bfe_u32 v54, v14, 8, 8 +; SI-NEXT: v_bfe_u32 v52, v13, 8, 8 +; SI-NEXT: v_bfe_u32 v50, v12, 8, 8 ; SI-NEXT: .LBB60_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v32 -; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; SI-NEXT: v_or_b32_e32 v24, v24, v32 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v26 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: v_or_b32_e32 v26, v26, v30 -; SI-NEXT: v_or_b32_e32 v24, v24, v26 -; SI-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v39 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_or_b32_e32 v20, v20, v24 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v48 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_or_b32_e32 v5, v24, v5 -; SI-NEXT: v_or_b32_e32 v5, v20, v5 -; SI-NEXT: v_add_i32_e32 v20, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v5, v20, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v42 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v31 -; SI-NEXT: v_or_b32_e32 v5, v5, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v27 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v25 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_or_b32_e32 v13, v20, v13 -; SI-NEXT: v_or_b32_e32 v5, v5, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v5, v13, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v37 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v5, v5, v12 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v38 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_or_b32_e32 v4, v12, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_add_i32_e32 v5, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v40 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v28 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v19 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v5, v10, v5 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v35 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v36 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v23 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v33 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v13 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v34 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v18 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v14 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v21 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v29 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v50 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload @@ -23616,327 +23261,328 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; SI-LABEL: bitcast_v20f16_to_v40i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s20 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v12, s30, 0 +; SI-NEXT: v_writelane_b32 v12, s31, 1 +; SI-NEXT: v_writelane_b32 v12, s34, 2 +; SI-NEXT: v_writelane_b32 v12, s35, 3 +; SI-NEXT: s_lshr_b32 s34, s25, 16 +; SI-NEXT: s_lshr_b32 s35, s24, 16 +; SI-NEXT: s_lshr_b32 s30, s23, 16 +; SI-NEXT: s_lshr_b32 s31, s22, 16 +; SI-NEXT: s_lshr_b32 s94, s21, 16 +; SI-NEXT: s_lshr_b32 s95, s20, 16 +; SI-NEXT: s_lshr_b32 s92, s19, 16 +; SI-NEXT: s_lshr_b32 s93, s18, 16 +; SI-NEXT: s_lshr_b32 s90, s17, 16 +; SI-NEXT: s_lshr_b32 s91, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB61_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s91, 16 +; SI-NEXT: s_or_b32 s12, s4, s5 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s90, 16 +; SI-NEXT: s_or_b32 s13, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s93, 16 +; SI-NEXT: s_or_b32 s10, s4, s5 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s92, 16 +; SI-NEXT: s_or_b32 s11, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s95, 16 +; SI-NEXT: s_or_b32 s8, s4, s5 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s94, 16 +; SI-NEXT: s_or_b32 s9, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s31, 16 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s30, 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[12:13], 24 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s35, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s25, 0xffff +; SI-NEXT: s_lshl_b32 s15, s34, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[28:29], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[42:43], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[44:45], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[56:57], s[6:7], 24 +; SI-NEXT: s_or_b32 s5, s5, s15 +; SI-NEXT: s_lshr_b32 s43, s13, 8 +; SI-NEXT: s_lshr_b32 s41, s11, 8 +; SI-NEXT: s_lshr_b32 s29, s9, 8 +; SI-NEXT: s_lshr_b32 s27, s7, 8 +; SI-NEXT: s_lshr_b32 s15, s5, 8 +; SI-NEXT: s_bfe_u32 s45, s90, 0x80008 +; SI-NEXT: s_bfe_u32 s47, s92, 0x80008 +; SI-NEXT: s_bfe_u32 s57, s94, 0x80008 +; SI-NEXT: s_bfe_u32 s59, s30, 0x80008 +; SI-NEXT: s_bfe_u32 s61, s34, 0x80008 +; SI-NEXT: s_lshr_b64 s[72:73], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[62:63], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[74:75], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[4:5], 8 +; SI-NEXT: s_cbranch_execnz .LBB61_4 +; SI-NEXT: .LBB61_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v2, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s35 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s34 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v18 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_readfirstlane_b32 s5, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s31 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v20 -; SI-NEXT: s_cmp_lg_u32 s26, 0 -; SI-NEXT: s_cbranch_scc0 .LBB61_4 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_readfirstlane_b32 s4, v8 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v7 -; SI-NEXT: s_or_b32 s12, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v5 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v6 -; SI-NEXT: s_or_b32 s13, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v11 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v10 -; SI-NEXT: s_or_b32 s10, s5, s4 ; SI-NEXT: v_readfirstlane_b32 s4, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v9 -; SI-NEXT: s_or_b32 s11, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v14 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v13 -; SI-NEXT: s_or_b32 s8, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v3 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v12 -; SI-NEXT: s_or_b32 s9, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v19 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v17 -; SI-NEXT: s_or_b32 s6, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v15 -; SI-NEXT: s_or_b32 s7, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v25 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v23 -; SI-NEXT: s_lshr_b64 s[14:15], s[12:13], 24 +; SI-NEXT: v_readfirstlane_b32 s6, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s22 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_readfirstlane_b32 s5, v1 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_readfirstlane_b32 s15, v21 -; SI-NEXT: s_lshr_b64 s[16:17], s[12:13], 16 -; SI-NEXT: s_lshr_b64 s[20:21], s[12:13], 8 -; SI-NEXT: s_lshr_b64 s[18:19], s[10:11], 24 -; SI-NEXT: s_lshr_b64 s[22:23], s[10:11], 16 -; SI-NEXT: s_or_b32 s5, s15, s5 -; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 8 -; SI-NEXT: s_lshr_b64 s[24:25], s[8:9], 24 -; SI-NEXT: s_lshr_b64 s[28:29], s[8:9], 16 -; SI-NEXT: s_lshr_b64 s[40:41], s[8:9], 8 -; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 24 -; SI-NEXT: s_lshr_b64 s[42:43], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[6:7], 8 -; SI-NEXT: s_lshr_b64 s[56:57], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[58:59], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[60:61], s[4:5], 8 -; SI-NEXT: s_lshr_b32 s23, s13, 8 -; SI-NEXT: s_lshr_b32 s21, s11, 8 -; SI-NEXT: s_lshr_b32 s19, s9, 8 -; SI-NEXT: s_lshr_b32 s17, s7, 8 -; SI-NEXT: s_lshr_b32 s15, s5, 8 -; SI-NEXT: v_bfe_u32 v24, v5, 8, 8 -; SI-NEXT: v_bfe_u32 v22, v4, 8, 8 -; SI-NEXT: v_bfe_u32 v20, v3, 8, 8 -; SI-NEXT: v_bfe_u32 v18, v2, 8, 8 -; SI-NEXT: v_bfe_u32 v16, v1, 8, 8 -; SI-NEXT: s_cbranch_execnz .LBB61_3 -; SI-NEXT: .LBB61_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_readfirstlane_b32 s4, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v21 -; SI-NEXT: v_readfirstlane_b32 s5, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s6, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s5, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_readfirstlane_b32 s6, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_readfirstlane_b32 s6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s30 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v2 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: v_readfirstlane_b32 s7, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s23 +; SI-NEXT: v_readfirstlane_b32 s7, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s95 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: v_readfirstlane_b32 s7, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: v_readfirstlane_b32 s8, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: v_readfirstlane_b32 s8, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_readfirstlane_b32 s8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s94 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v3 ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: v_readfirstlane_b32 s9, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s21 +; SI-NEXT: v_readfirstlane_b32 s9, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s93 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: v_readfirstlane_b32 s10, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 ; SI-NEXT: v_readfirstlane_b32 s9, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: v_readfirstlane_b32 s10, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: v_readfirstlane_b32 s10, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_readfirstlane_b32 s10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s92 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v4 ; SI-NEXT: s_lshl_b32 s10, s10, 16 -; SI-NEXT: v_readfirstlane_b32 s11, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 +; SI-NEXT: v_readfirstlane_b32 s11, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s91 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_readfirstlane_b32 s11, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: v_readfirstlane_b32 s12, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_readfirstlane_b32 s11, v4 +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: v_readfirstlane_b32 s12, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v7 ; SI-NEXT: s_or_b32 s11, s12, s11 -; SI-NEXT: v_readfirstlane_b32 s12, v8 +; SI-NEXT: v_readfirstlane_b32 s12, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s90 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s17 ; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: v_readfirstlane_b32 s13, v7 -; SI-NEXT: s_or_b32 s12, s13, s12 ; SI-NEXT: v_readfirstlane_b32 s13, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: v_readfirstlane_b32 s13, v7 ; SI-NEXT: s_lshl_b32 s13, s13, 16 -; SI-NEXT: v_readfirstlane_b32 s14, v6 +; SI-NEXT: v_readfirstlane_b32 s14, v5 ; SI-NEXT: s_or_b32 s13, s14, s13 ; SI-NEXT: s_lshr_b64 s[14:15], s[12:13], 24 -; SI-NEXT: s_lshr_b64 s[16:17], s[12:13], 16 -; SI-NEXT: s_lshr_b64 s[20:21], s[12:13], 8 -; SI-NEXT: s_lshr_b64 s[18:19], s[10:11], 24 -; SI-NEXT: s_lshr_b64 s[22:23], s[10:11], 16 -; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 8 -; SI-NEXT: s_lshr_b64 s[24:25], s[8:9], 24 -; SI-NEXT: s_lshr_b64 s[28:29], s[8:9], 16 -; SI-NEXT: s_lshr_b64 s[40:41], s[8:9], 8 -; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 24 -; SI-NEXT: s_lshr_b64 s[42:43], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[6:7], 8 -; SI-NEXT: s_lshr_b64 s[56:57], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[58:59], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[60:61], s[4:5], 8 -; SI-NEXT: s_lshr_b32 s23, s13, 8 -; SI-NEXT: s_lshr_b32 s21, s11, 8 -; SI-NEXT: s_lshr_b32 s19, s9, 8 -; SI-NEXT: s_lshr_b32 s17, s7, 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[28:29], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[42:43], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[44:45], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[56:57], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[72:73], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[62:63], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[74:75], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[4:5], 8 +; SI-NEXT: s_lshr_b32 s43, s13, 8 +; SI-NEXT: s_lshr_b32 s41, s11, 8 +; SI-NEXT: s_lshr_b32 s29, s9, 8 +; SI-NEXT: s_lshr_b32 s27, s7, 8 ; SI-NEXT: s_lshr_b32 s15, s5, 8 -; SI-NEXT: v_bfe_u32 v24, v5, 8, 8 -; SI-NEXT: v_bfe_u32 v22, v4, 8, 8 -; SI-NEXT: v_bfe_u32 v20, v3, 8, 8 -; SI-NEXT: v_bfe_u32 v18, v2, 8, 8 -; SI-NEXT: v_bfe_u32 v16, v1, 8, 8 -; SI-NEXT: .LBB61_3: ; %end +; SI-NEXT: v_bfe_u32 v10, v7, 8, 8 +; SI-NEXT: v_bfe_u32 v9, v4, 8, 8 +; SI-NEXT: v_bfe_u32 v8, v3, 8, 8 +; SI-NEXT: v_bfe_u32 v6, v2, 8, 8 +; SI-NEXT: v_bfe_u32 v5, v1, 8, 8 +; SI-NEXT: s_branch .LBB61_5 +; SI-NEXT: .LBB61_3: +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: s_branch .LBB61_2 +; SI-NEXT: .LBB61_4: +; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v2, s30 +; SI-NEXT: v_mov_b32_e32 v3, s94 +; SI-NEXT: v_mov_b32_e32 v4, s92 +; SI-NEXT: v_mov_b32_e32 v7, s90 +; SI-NEXT: v_mov_b32_e32 v5, s61 +; SI-NEXT: v_mov_b32_e32 v6, s59 +; SI-NEXT: v_mov_b32_e32 v8, s57 +; SI-NEXT: v_mov_b32_e32 v9, s47 +; SI-NEXT: v_mov_b32_e32 v10, s45 +; SI-NEXT: .LBB61_5: ; %end ; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: s_lshl_b32 s20, s20, 8 -; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_or_b32 s12, s12, s20 +; SI-NEXT: s_lshl_b32 s16, s40, 8 +; SI-NEXT: s_or_b32 s12, s12, s16 +; SI-NEXT: s_and_b32 s16, s26, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_lshl_b32 s14, s14, 24 ; SI-NEXT: s_and_b32 s12, s12, 0xffff ; SI-NEXT: s_or_b32 s14, s14, s16 ; SI-NEXT: s_or_b32 s12, s12, s14 -; SI-NEXT: v_mov_b32_e32 v6, s12 +; SI-NEXT: v_mov_b32_e32 v11, s12 ; SI-NEXT: s_and_b32 s12, s13, 0xff -; SI-NEXT: s_lshl_b32 s13, s23, 8 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s13, s43, 8 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: s_or_b32 s12, s12, s13 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 ; SI-NEXT: s_and_b32 s12, s12, 0xffff -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_or_b32_e32 v5, s12, v5 +; SI-NEXT: v_or_b32_e32 v7, v10, v7 +; SI-NEXT: v_or_b32_e32 v7, s12, v7 ; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: s_lshl_b32 s12, s26, 8 +; SI-NEXT: s_lshl_b32 s12, s58, 8 ; SI-NEXT: s_or_b32 s10, s10, s12 -; SI-NEXT: s_and_b32 s12, s22, 0xff +; SI-NEXT: s_and_b32 s12, s42, 0xff ; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: s_lshl_b32 s13, s18, 24 +; SI-NEXT: s_lshl_b32 s13, s28, 24 ; SI-NEXT: s_and_b32 s10, s10, 0xffff ; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: v_add_i32_e32 v6, vcc, 4, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 4, v0 ; SI-NEXT: s_or_b32 s10, s10, s12 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 8, v0 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_add_i32_e32 v7, vcc, 8, v0 +; SI-NEXT: v_mov_b32_e32 v10, s10 ; SI-NEXT: s_and_b32 s10, s11, 0xff -; SI-NEXT: s_lshl_b32 s11, s21, 8 +; SI-NEXT: s_lshl_b32 s11, s41, 8 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v10, v7, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v22 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v9 ; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v4, v7, v4 ; SI-NEXT: v_or_b32_e32 v4, s10, v4 ; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: s_lshl_b32 s10, s40, 8 +; SI-NEXT: s_lshl_b32 s10, s60, 8 ; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xff +; SI-NEXT: s_and_b32 s10, s46, 0xff ; SI-NEXT: s_lshl_b32 s10, s10, 16 -; SI-NEXT: s_lshl_b32 s11, s24, 24 +; SI-NEXT: s_lshl_b32 s11, s44, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xffff ; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_add_i32_e32 v5, vcc, 12, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 12, v0 ; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; SI-NEXT: v_mov_b32_e32 v5, s8 +; SI-NEXT: v_mov_b32_e32 v7, s8 ; SI-NEXT: s_and_b32 s8, s9, 0xff -; SI-NEXT: s_lshl_b32 s9, s19, 8 +; SI-NEXT: s_lshl_b32 s9, s29, 8 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v7, v4, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v20 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v8 ; SI-NEXT: s_and_b32 s8, s8, 0xffff ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v3, s8, v3 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_lshl_b32 s8, s44, 8 +; SI-NEXT: s_lshl_b32 s8, s76, 8 ; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: s_and_b32 s8, s42, 0xff +; SI-NEXT: s_and_b32 s8, s72, 0xff ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_lshl_b32 s9, s46, 24 +; SI-NEXT: s_lshl_b32 s9, s56, 24 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: v_add_i32_e32 v4, vcc, 20, v0 @@ -23946,21 +23592,21 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: v_mov_b32_e32 v4, s6 ; SI-NEXT: s_and_b32 s6, s7, 0xff -; SI-NEXT: s_lshl_b32 s7, s17, 8 +; SI-NEXT: s_lshl_b32 s7, s27, 8 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v18 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v6 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v2, s6, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s6, s60, 8 +; SI-NEXT: s_lshl_b32 s6, s78, 8 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: s_and_b32 s6, s58, 0xff +; SI-NEXT: s_and_b32 s6, s74, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s7, s56, 24 +; SI-NEXT: s_lshl_b32 s7, s62, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 @@ -23975,46 +23621,21 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v5 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v1, s4, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_readlane_b32 s35, v12, 3 +; SI-NEXT: v_readlane_b32 s34, v12, 2 +; SI-NEXT: v_readlane_b32 s31, v12, 1 +; SI-NEXT: v_readlane_b32 s30, v12, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB61_4: -; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr20 -; SI-NEXT: ; implicit-def: $sgpr16 -; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr22 -; SI-NEXT: ; implicit-def: $sgpr18 -; SI-NEXT: ; implicit-def: $sgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $sgpr24 -; SI-NEXT: ; implicit-def: $sgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: s_branch .LBB61_2 ; ; VI-LABEL: bitcast_v20f16_to_v40i8_scalar: ; VI: ; %bb.0: @@ -24753,129 +24374,150 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 -; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v3 -; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v11 -; SI-NEXT: v_lshlrev_b32_e32 v54, 8, v13 -; SI-NEXT: v_lshlrev_b32_e32 v55, 8, v15 -; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v17 -; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v19 -; SI-NEXT: v_lshlrev_b32_e32 v42, 8, v21 -; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v31, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v36, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v38, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v19 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v29 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v17 ; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v25 -; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v27 -; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v29 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v31 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v35 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v34 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v35 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v57 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v53 +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v41 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB62_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v0, v0, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v0, v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 +; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v36, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v29, v0, v2 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v8 -; SI-NEXT: v_or_b32_e32 v0, v0, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 -; SI-NEXT: v_or_b32_e32 v0, v0, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v0, v0, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 -; SI-NEXT: v_or_b32_e32 v0, v0, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v16 -; SI-NEXT: v_or_b32_e32 v0, v0, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 -; SI-NEXT: v_or_b32_e32 v0, v0, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v20 -; SI-NEXT: v_or_b32_e32 v0, v0, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 -; SI-NEXT: v_or_b32_e32 v0, v0, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v24 -; SI-NEXT: v_or_b32_e32 v0, v0, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v26 -; SI-NEXT: v_or_b32_e32 v0, v0, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v28 -; SI-NEXT: v_or_b32_e32 v0, v0, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v30 -; SI-NEXT: v_or_b32_e32 v0, v0, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v34 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v10, v38, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v31, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v12, v39, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v21, v0, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v18, v56, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v43 +; SI-NEXT: v_or_b32_e32 v0, v0, v48 +; SI-NEXT: v_or_b32_e32 v11, v52, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v7, v7, v45 +; SI-NEXT: v_or_b32_e32 v14, v47, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v26 +; SI-NEXT: v_or_b32_e32 v20, v59, v15 +; SI-NEXT: v_or_b32_e32 v15, v0, v11 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v0, v0, v44 +; SI-NEXT: v_or_b32_e32 v7, v7, v14 +; SI-NEXT: v_or_b32_e32 v13, v46, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v32 +; SI-NEXT: v_or_b32_e32 v3, v3, v37 +; SI-NEXT: v_or_b32_e32 v5, v5, v51 +; SI-NEXT: v_alignbit_b32 v27, v7, v13, 16 +; SI-NEXT: v_or_b32_e32 v9, v9, v58 +; SI-NEXT: v_or_b32_e32 v13, v0, v13 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v49 -; SI-NEXT: v_or_b32_e32 v0, v0, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v50 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_or_b32_e32 v0, v0, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v52 -; SI-NEXT: v_or_b32_e32 v0, v0, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 -; SI-NEXT: v_or_b32_e32 v0, v0, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v12 +; SI-NEXT: v_or_b32_e32 v9, v9, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_alignbit_b32 v17, v1, v2, 16 +; SI-NEXT: v_alignbit_b32 v19, v3, v10, 16 +; SI-NEXT: v_alignbit_b32 v25, v5, v11, 16 +; SI-NEXT: v_alignbit_b32 v35, v9, v20, 16 +; SI-NEXT: v_or_b32_e32 v11, v0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v18 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 @@ -24893,191 +24535,200 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: .LBB62_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB62_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v51 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v59, v1 -; SI-NEXT: v_add_i32_e32 v35, vcc, 0x300, v1 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v52 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_or_b32_e32 v1, v58, v1 -; SI-NEXT: v_add_i32_e32 v33, vcc, s6, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v50 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 -; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 -; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v30 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v47, v1 -; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v28 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v46, v1 -; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v43 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v59, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v45, v1 -; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v50 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v56, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v26 ; SI-NEXT: v_or_b32_e32 v1, v44, v1 -; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v43, v1 -; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v42, v1 -; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v46, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v41, v1 -; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v30 +; SI-NEXT: v_or_b32_e32 v1, v45, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v47, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v40, v1 -; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v55, v1 -; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v54, v1 -; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v18 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v52, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v53, v1 -; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v22 +; SI-NEXT: v_or_b32_e32 v1, v51, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v39, v1 -; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v10 +; SI-NEXT: v_or_b32_e32 v1, v34, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v38, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v14 +; SI-NEXT: v_or_b32_e32 v1, v37, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v36, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v33, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v29, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v1, v37, v1 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v31, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v0 +; SI-NEXT: v_alignbit_b32 v17, v1, v29, 16 +; SI-NEXT: v_alignbit_b32 v19, v3, v21, 16 +; SI-NEXT: v_alignbit_b32 v25, v5, v15, 16 +; SI-NEXT: v_alignbit_b32 v27, v7, v13, 16 +; SI-NEXT: v_alignbit_b32 v35, v9, v11, 16 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v9 ; SI-NEXT: .LBB62_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v21 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v19 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v29 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v27 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v34 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v33 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v54 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v25 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v55 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v40 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v41 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -26032,301 +25683,365 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32 ; SI-LABEL: bitcast_v40i8_to_v20f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v27, s30, 0 +; SI-NEXT: v_writelane_b32 v27, s31, 1 +; SI-NEXT: v_writelane_b32 v27, s34, 2 +; SI-NEXT: v_writelane_b32 v27, s35, 3 +; SI-NEXT: v_writelane_b32 v27, s36, 4 +; SI-NEXT: v_writelane_b32 v27, s37, 5 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; SI-NEXT: v_readfirstlane_b32 s62, v25 -; SI-NEXT: v_readfirstlane_b32 s63, v24 -; SI-NEXT: v_readfirstlane_b32 s60, v23 -; SI-NEXT: v_readfirstlane_b32 s61, v22 -; SI-NEXT: v_readfirstlane_b32 s58, v21 -; SI-NEXT: v_readfirstlane_b32 s59, v20 -; SI-NEXT: v_readfirstlane_b32 s56, v19 -; SI-NEXT: v_readfirstlane_b32 s57, v18 -; SI-NEXT: v_readfirstlane_b32 s46, v17 -; SI-NEXT: v_readfirstlane_b32 s47, v16 -; SI-NEXT: v_readfirstlane_b32 s44, v15 -; SI-NEXT: v_readfirstlane_b32 s45, v14 -; SI-NEXT: v_readfirstlane_b32 s42, v13 -; SI-NEXT: v_readfirstlane_b32 s43, v12 -; SI-NEXT: v_readfirstlane_b32 s15, v11 -; SI-NEXT: v_readfirstlane_b32 s41, v10 -; SI-NEXT: v_readfirstlane_b32 s12, v9 -; SI-NEXT: v_readfirstlane_b32 s14, v8 -; SI-NEXT: v_readfirstlane_b32 s8, v7 -; SI-NEXT: v_readfirstlane_b32 s11, v6 -; SI-NEXT: v_readfirstlane_b32 s6, v5 +; SI-NEXT: v_writelane_b32 v27, s38, 6 +; SI-NEXT: v_readfirstlane_b32 s91, v25 +; SI-NEXT: v_readfirstlane_b32 s90, v24 +; SI-NEXT: v_readfirstlane_b32 s94, v23 +; SI-NEXT: v_readfirstlane_b32 s95, v22 +; SI-NEXT: v_readfirstlane_b32 s31, v21 +; SI-NEXT: v_readfirstlane_b32 s30, v20 +; SI-NEXT: v_readfirstlane_b32 s34, v19 +; SI-NEXT: v_readfirstlane_b32 s35, v18 +; SI-NEXT: v_readfirstlane_b32 s75, v17 +; SI-NEXT: v_readfirstlane_b32 s74, v16 +; SI-NEXT: v_readfirstlane_b32 s78, v15 +; SI-NEXT: v_readfirstlane_b32 s79, v14 +; SI-NEXT: v_readfirstlane_b32 s89, v13 +; SI-NEXT: v_readfirstlane_b32 s88, v12 +; SI-NEXT: v_readfirstlane_b32 s92, v11 +; SI-NEXT: v_readfirstlane_b32 s93, v10 +; SI-NEXT: v_readfirstlane_b32 s61, v9 +; SI-NEXT: v_readfirstlane_b32 s60, v8 +; SI-NEXT: v_readfirstlane_b32 s62, v7 +; SI-NEXT: v_readfirstlane_b32 s63, v6 +; SI-NEXT: v_readfirstlane_b32 s73, v5 +; SI-NEXT: v_readfirstlane_b32 s72, v4 +; SI-NEXT: v_readfirstlane_b32 s76, v3 +; SI-NEXT: v_readfirstlane_b32 s77, v2 +; SI-NEXT: v_readfirstlane_b32 s58, v1 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s40, v4 -; SI-NEXT: v_readfirstlane_b32 s10, v3 -; SI-NEXT: v_readfirstlane_b32 s13, v2 -; SI-NEXT: v_readfirstlane_b32 s7, v1 -; SI-NEXT: v_readfirstlane_b32 s9, v0 +; SI-NEXT: v_readfirstlane_b32 s59, v0 +; SI-NEXT: v_writelane_b32 v27, s39, 7 ; SI-NEXT: s_cbranch_scc0 .LBB63_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_or_b32 s8, s4, s5 ; SI-NEXT: s_and_b32 s4, s18, 0xff -; SI-NEXT: s_lshl_b32 s5, s19, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: s_lshl_b32 s5, s21, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_and_b32 s4, s22, 0xff -; SI-NEXT: s_lshl_b32 s5, s23, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_and_b32 s4, s24, 0xff -; SI-NEXT: s_lshl_b32 s5, s25, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_and_b32 s4, s26, 0xff -; SI-NEXT: s_lshl_b32 s5, s27, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_and_b32 s4, s28, 0xff -; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_and_b32 s4, s9, 0xff -; SI-NEXT: s_lshl_b32 s5, s7, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_and_b32 s4, s13, 0xff -; SI-NEXT: s_lshl_b32 s5, s10, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_and_b32 s4, s40, 0xff -; SI-NEXT: s_lshl_b32 s5, s6, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_and_b32 s4, s11, 0xff -; SI-NEXT: s_lshl_b32 s5, s8, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_and_b32 s4, s14, 0xff -; SI-NEXT: s_lshl_b32 s5, s12, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_and_b32 s4, s41, 0xff -; SI-NEXT: s_lshl_b32 s5, s15, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_and_b32 s4, s43, 0xff -; SI-NEXT: s_lshl_b32 s5, s42, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_and_b32 s4, s45, 0xff -; SI-NEXT: s_lshl_b32 s5, s44, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_and_b32 s4, s47, 0xff -; SI-NEXT: s_lshl_b32 s5, s46, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_and_b32 s4, s57, 0xff -; SI-NEXT: s_lshl_b32 s5, s56, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_and_b32 s4, s59, 0xff -; SI-NEXT: s_lshl_b32 s5, s58, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_and_b32 s4, s61, 0xff -; SI-NEXT: s_lshl_b32 s5, s60, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_and_b32 s4, s63, 0xff -; SI-NEXT: s_lshl_b32 s5, s62, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s24, 0xff +; SI-NEXT: s_lshl_b32 s6, s25, 8 +; SI-NEXT: s_or_b32 s9, s5, s6 +; SI-NEXT: s_and_b32 s5, s26, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s27, 24 +; SI-NEXT: s_or_b32 s6, s6, s5 +; SI-NEXT: s_and_b32 s5, s77, 0xff +; SI-NEXT: s_lshl_b32 s7, s76, 8 +; SI-NEXT: s_or_b32 s10, s5, s7 +; SI-NEXT: s_and_b32 s5, s72, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s73, 24 +; SI-NEXT: s_or_b32 s42, s7, s5 +; SI-NEXT: s_and_b32 s5, s93, 0xff +; SI-NEXT: s_lshl_b32 s7, s92, 8 +; SI-NEXT: s_or_b32 s11, s5, s7 +; SI-NEXT: s_and_b32 s5, s88, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s89, 24 +; SI-NEXT: s_or_b32 s44, s7, s5 +; SI-NEXT: s_and_b32 s5, s35, 0xff +; SI-NEXT: s_lshl_b32 s7, s34, 8 +; SI-NEXT: s_or_b32 s12, s5, s7 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s7, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s13, s23, 24 +; SI-NEXT: s_or_b32 s56, s13, s7 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s13, s29, 8 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s59, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_lshl_b32 s14, s58, 24 +; SI-NEXT: s_or_b32 s57, s14, s13 +; SI-NEXT: s_and_b32 s13, s63, 0xff +; SI-NEXT: s_lshl_b32 s14, s62, 8 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s14, s60, 0xff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_lshl_b32 s15, s61, 24 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_or_b32 vcc_lo, s15, s14 +; SI-NEXT: s_or_b32 s43, s13, vcc_lo +; SI-NEXT: s_and_b32 s13, s79, 0xff +; SI-NEXT: s_lshl_b32 s14, s78, 8 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s14, s74, 0xff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_lshl_b32 s15, s75, 24 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_or_b32 vcc_hi, s15, s14 +; SI-NEXT: s_or_b32 s45, s13, vcc_hi +; SI-NEXT: s_and_b32 s13, s95, 0xff +; SI-NEXT: s_lshl_b32 s14, s94, 8 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s14, s90, 0xff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_lshl_b32 s15, s91, 24 +; SI-NEXT: s_or_b32 s36, s15, s14 +; SI-NEXT: s_and_b32 s14, s30, 0xff +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_lshl_b32 s15, s31, 24 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s56 +; SI-NEXT: s_or_b32 s7, s7, s57 +; SI-NEXT: s_or_b32 s46, s15, s14 +; SI-NEXT: s_or_b32 s47, s13, s36 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_and_b32 s37, s10, 0xffff +; SI-NEXT: s_and_b32 s38, s11, 0xffff +; SI-NEXT: s_and_b32 s39, s12, 0xffff +; SI-NEXT: s_or_b32 s12, s8, s4 +; SI-NEXT: s_mov_b32 s13, s5 +; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 16 +; SI-NEXT: s_or_b32 s10, s9, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_lshr_b64 s[40:41], s[6:7], 16 +; SI-NEXT: s_or_b32 s8, s37, s42 +; SI-NEXT: s_mov_b32 s9, s43 +; SI-NEXT: s_lshr_b64 s[42:43], s[42:43], 16 +; SI-NEXT: s_or_b32 s6, s38, s44 +; SI-NEXT: s_mov_b32 s7, s45 +; SI-NEXT: s_lshr_b64 s[44:45], s[44:45], 16 +; SI-NEXT: s_or_b32 s4, s39, s46 +; SI-NEXT: s_mov_b32 s5, s47 +; SI-NEXT: s_lshr_b64 s[46:47], s[46:47], 16 +; SI-NEXT: s_lshr_b32 s41, s56, 16 +; SI-NEXT: s_lshr_b32 s43, s57, 16 +; SI-NEXT: s_lshr_b32 s45, vcc_lo, 16 +; SI-NEXT: s_lshr_b32 s47, vcc_hi, 16 +; SI-NEXT: s_lshr_b32 s15, s36, 16 ; SI-NEXT: s_cbranch_execnz .LBB63_3 ; SI-NEXT: .LBB63_2: ; %cmp.true -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_and_b32 s11, s11, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 8 -; SI-NEXT: s_add_i32 s40, s40, 3 -; SI-NEXT: s_or_b32 s8, s8, s11 -; SI-NEXT: s_and_b32 s11, s40, 0xff -; SI-NEXT: s_lshl_b32 s6, s6, 8 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_or_b32 s6, s6, s11 -; SI-NEXT: s_and_b32 s11, s13, 0xff -; SI-NEXT: s_lshl_b32 s10, s10, 8 -; SI-NEXT: s_and_b32 s9, s9, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s14, s14, 3 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s28, 0xff -; SI-NEXT: s_lshl_b32 s11, s29, 8 -; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s35, s35, 3 +; SI-NEXT: s_and_b32 s4, s35, 0xff +; SI-NEXT: s_lshl_b32 s5, s34, 8 +; SI-NEXT: s_add_i32 s30, s30, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s30, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s31, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s95, s95, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s95, 0xff +; SI-NEXT: s_lshl_b32 s6, s94, 8 +; SI-NEXT: s_add_i32 s90, s90, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s90, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s91, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s93, s93, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s93, 0xff +; SI-NEXT: s_lshl_b32 s7, s92, 8 +; SI-NEXT: s_add_i32 s88, s88, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s88, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s89, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_add_i32 s79, s79, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s79, 0xff +; SI-NEXT: s_lshl_b32 s8, s78, 8 +; SI-NEXT: s_add_i32 s74, s74, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s9, s74, 0xff +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: s_lshl_b32 s8, s75, 24 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_add_i32 s77, s77, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s77, 0xff +; SI-NEXT: s_lshl_b32 s9, s76, 8 +; SI-NEXT: s_add_i32 s72, s72, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s10, s72, 0xff +; SI-NEXT: s_addk_i32 s8, 0x300 +; SI-NEXT: s_lshl_b32 s9, s73, 24 +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_add_i32 s63, s63, 3 -; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: s_lshl_b32 s12, s12, 8 -; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: s_and_b32 s11, s26, 0xff -; SI-NEXT: s_lshl_b32 s13, s27, 8 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s63, 0xff +; SI-NEXT: s_lshl_b32 s10, s62, 8 +; SI-NEXT: s_add_i32 s60, s60, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s11, s60, 0xff +; SI-NEXT: s_addk_i32 s9, 0x300 +; SI-NEXT: s_lshl_b32 s10, s61, 24 +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_and_b32 s4, s63, 0xff -; SI-NEXT: s_lshl_b32 s5, s62, 8 -; SI-NEXT: s_add_i32 s61, s61, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s24, 0xff +; SI-NEXT: s_lshl_b32 s11, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s12, s26, 0xff +; SI-NEXT: s_addk_i32 s10, 0x300 +; SI-NEXT: s_lshl_b32 s11, s27, 24 +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s28, 0xff +; SI-NEXT: s_lshl_b32 s12, s29, 8 ; SI-NEXT: s_add_i32 s59, s59, 3 -; SI-NEXT: s_add_i32 s57, s57, 3 -; SI-NEXT: s_add_i32 s47, s47, 3 -; SI-NEXT: s_add_i32 s45, s45, 3 -; SI-NEXT: s_add_i32 s43, s43, 3 -; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_or_b32 s12, s12, s14 -; SI-NEXT: s_or_b32 s11, s13, s11 -; SI-NEXT: s_and_b32 s13, s24, 0xff -; SI-NEXT: s_lshl_b32 s14, s25, 8 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s13, s59, 0xff +; SI-NEXT: s_addk_i32 s11, 0x300 +; SI-NEXT: s_lshl_b32 s12, s58, 24 +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_or_b32 s12, s12, s13 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s61, 0xff -; SI-NEXT: s_lshl_b32 s60, s60, 8 -; SI-NEXT: s_and_b32 s59, s59, 0xff -; SI-NEXT: s_lshl_b32 s58, s58, 8 -; SI-NEXT: s_and_b32 s57, s57, 0xff -; SI-NEXT: s_lshl_b32 s56, s56, 8 -; SI-NEXT: s_and_b32 s47, s47, 0xff -; SI-NEXT: s_lshl_b32 s46, s46, 8 -; SI-NEXT: s_and_b32 s45, s45, 0xff -; SI-NEXT: s_lshl_b32 s44, s44, 8 -; SI-NEXT: s_and_b32 s43, s43, 0xff -; SI-NEXT: s_lshl_b32 s42, s42, 8 -; SI-NEXT: s_and_b32 s41, s41, 0xff -; SI-NEXT: s_lshl_b32 s15, s15, 8 -; SI-NEXT: s_or_b32 s13, s14, s13 -; SI-NEXT: s_and_b32 s14, s22, 0xff -; SI-NEXT: s_lshl_b32 s22, s23, 8 -; SI-NEXT: s_and_b32 s20, s20, 0xff -; SI-NEXT: s_lshl_b32 s21, s21, 8 -; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 8 -; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: s_or_b32 s5, s60, s5 -; SI-NEXT: s_or_b32 s58, s58, s59 -; SI-NEXT: s_or_b32 s56, s56, s57 -; SI-NEXT: s_or_b32 s46, s46, s47 -; SI-NEXT: s_or_b32 s44, s44, s45 -; SI-NEXT: s_or_b32 s42, s42, s43 -; SI-NEXT: s_or_b32 s15, s15, s41 -; SI-NEXT: s_or_b32 s14, s22, s14 -; SI-NEXT: s_or_b32 s20, s21, s20 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_addk_i32 s5, 0x300 -; SI-NEXT: s_addk_i32 s58, 0x300 -; SI-NEXT: s_addk_i32 s56, 0x300 -; SI-NEXT: s_addk_i32 s46, 0x300 -; SI-NEXT: s_addk_i32 s44, 0x300 -; SI-NEXT: s_addk_i32 s42, 0x300 -; SI-NEXT: s_addk_i32 s15, 0x300 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s12, s16, 0xff +; SI-NEXT: s_lshl_b32 s13, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s14, s18, 0xff ; SI-NEXT: s_addk_i32 s12, 0x300 -; SI-NEXT: s_addk_i32 s8, 0x300 -; SI-NEXT: s_addk_i32 s6, 0x300 -; SI-NEXT: s_addk_i32 s10, 0x300 -; SI-NEXT: s_addk_i32 s7, 0x300 -; SI-NEXT: s_addk_i32 s9, 0x300 -; SI-NEXT: s_addk_i32 s11, 0x300 +; SI-NEXT: s_lshl_b32 s13, s19, 24 +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s20, 0xff +; SI-NEXT: s_lshl_b32 s14, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_and_b32 s15, s22, 0xff ; SI-NEXT: s_addk_i32 s13, 0x300 -; SI-NEXT: s_addk_i32 s14, 0x300 -; SI-NEXT: s_addk_i32 s20, 0x300 -; SI-NEXT: s_addk_i32 s18, 0x300 -; SI-NEXT: s_addk_i32 s16, 0x300 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshl_b32 s14, s23, 24 +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_add_i32 s7, s7, 0x3000000 +; SI-NEXT: s_add_i32 s8, s8, 0x3000000 +; SI-NEXT: s_add_i32 s9, s9, 0x3000000 +; SI-NEXT: s_add_i32 s10, s10, 0x3000000 +; SI-NEXT: s_add_i32 s11, s11, 0x3000000 +; SI-NEXT: s_add_i32 s12, s12, 0x3000000 +; SI-NEXT: s_add_i32 s13, s13, 0x3000000 +; SI-NEXT: s_lshr_b64 s[14:15], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 16 +; SI-NEXT: s_lshr_b32 s41, s13, 16 +; SI-NEXT: s_lshr_b32 s43, s11, 16 +; SI-NEXT: s_lshr_b32 s45, s9, 16 +; SI-NEXT: s_lshr_b32 s47, s7, 16 +; SI-NEXT: s_lshr_b32 s15, s5, 16 ; SI-NEXT: .LBB63_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v9 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v13 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v14 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v18 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v17 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_or_b32 s12, s12, s14 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s14, s41, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s14, s40, 16 +; SI-NEXT: s_or_b32 s10, s10, s14 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s14, s43, 16 +; SI-NEXT: s_or_b32 s11, s11, s14 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s14, s42, 16 +; SI-NEXT: s_or_b32 s8, s8, s14 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s14, s45, 16 +; SI-NEXT: s_or_b32 s9, s9, s14 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s14, s44, 16 +; SI-NEXT: s_or_b32 s6, s6, s14 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s14, s47, 16 +; SI-NEXT: s_or_b32 s7, s7, s14 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s14, s46, 16 +; SI-NEXT: s_or_b32 s4, s4, s14 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s14, s15, 16 +; SI-NEXT: s_or_b32 s5, s5, s14 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: v_mov_b32_e32 v9, s5 +; SI-NEXT: v_readlane_b32 s39, v27, 7 +; SI-NEXT: v_readlane_b32 s38, v27, 6 +; SI-NEXT: v_readlane_b32 s37, v27, 5 +; SI-NEXT: v_readlane_b32 s36, v27, 4 +; SI-NEXT: v_readlane_b32 s35, v27, 3 +; SI-NEXT: v_readlane_b32 s34, v27, 2 +; SI-NEXT: v_readlane_b32 s31, v27, 1 +; SI-NEXT: v_readlane_b32 s30, v27, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB63_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr15 ; SI-NEXT: s_branch .LBB63_2 ; ; VI-LABEL: bitcast_v40i8_to_v20f16_scalar: @@ -27135,56 +26850,26 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v20f16_to_v5f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v9 +; SI-NEXT: v_mov_b32_e32 v16, v9 +; SI-NEXT: v_mov_b32_e32 v17, v8 +; SI-NEXT: v_mov_b32_e32 v18, v7 +; SI-NEXT: v_mov_b32_e32 v19, v6 +; SI-NEXT: v_mov_b32_e32 v20, v5 +; SI-NEXT: v_mov_b32_e32 v21, v4 +; SI-NEXT: v_mov_b32_e32 v22, v3 +; SI-NEXT: v_mov_b32_e32 v23, v2 +; SI-NEXT: v_mov_b32_e32 v24, v1 +; SI-NEXT: v_mov_b32_e32 v25, v0 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -27197,36 +26882,36 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB64_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 -; SI-NEXT: v_or_b32_e32 v1, v32, v1 -; SI-NEXT: v_or_b32_e32 v2, v30, v2 -; SI-NEXT: v_or_b32_e32 v3, v28, v3 -; SI-NEXT: v_or_b32_e32 v4, v26, v4 -; SI-NEXT: v_or_b32_e32 v5, v24, v5 -; SI-NEXT: v_or_b32_e32 v6, v22, v6 -; SI-NEXT: v_or_b32_e32 v7, v20, v7 -; SI-NEXT: v_or_b32_e32 v8, v18, v8 -; SI-NEXT: v_or_b32_e32 v9, v16, v9 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v35 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v32 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v31 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v30 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v28 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v27 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v26 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 @@ -27237,13 +26922,23 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB64_2 ; SI-NEXT: .LBB64_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v24 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -27256,10 +26951,10 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v22 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -27267,10 +26962,10 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v31 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -27278,11 +26973,11 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v30 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v29 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -27290,11 +26985,11 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v19 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v18 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -27302,11 +26997,11 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v16 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 @@ -27435,124 +27130,106 @@ define inreg <5 x double> @bitcast_v20f16_to_v5f64_scalar(<20 x half> inreg %a, ; SI-LABEL: bitcast_v20f16_to_v5f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v10 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v16, s36, 0 +; SI-NEXT: v_writelane_b32 v16, s37, 1 +; SI-NEXT: v_writelane_b32 v16, s38, 2 +; SI-NEXT: v_writelane_b32 v16, s39, 3 +; SI-NEXT: v_writelane_b32 v16, s48, 4 +; SI-NEXT: v_writelane_b32 v16, s49, 5 +; SI-NEXT: s_lshr_b32 s6, s25, 16 +; SI-NEXT: s_lshr_b32 s7, s24, 16 +; SI-NEXT: s_lshr_b32 s8, s23, 16 +; SI-NEXT: s_lshr_b32 s9, s22, 16 +; SI-NEXT: s_lshr_b32 s10, s21, 16 +; SI-NEXT: s_lshr_b32 s11, s20, 16 +; SI-NEXT: s_lshr_b32 s12, s19, 16 +; SI-NEXT: s_lshr_b32 s13, s18, 16 +; SI-NEXT: s_lshr_b32 s14, s17, 16 +; SI-NEXT: s_lshr_b32 s15, s16, 16 +; SI-NEXT: v_writelane_b32 v16, s50, 6 ; SI-NEXT: s_cmp_lg_u32 s26, 0 -; SI-NEXT: s_cbranch_scc0 .LBB65_4 +; SI-NEXT: v_writelane_b32 v16, s51, 7 +; SI-NEXT: s_cbranch_scc0 .LBB65_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 -; SI-NEXT: v_or_b32_e32 v1, v32, v1 -; SI-NEXT: v_or_b32_e32 v2, v30, v2 -; SI-NEXT: v_or_b32_e32 v3, v28, v3 -; SI-NEXT: v_or_b32_e32 v4, v26, v4 -; SI-NEXT: v_or_b32_e32 v5, v24, v5 -; SI-NEXT: v_or_b32_e32 v6, v22, v6 -; SI-NEXT: v_or_b32_e32 v7, v20, v7 -; SI-NEXT: v_or_b32_e32 v8, v18, v8 -; SI-NEXT: v_or_b32_e32 v9, v16, v9 -; SI-NEXT: s_cbranch_execnz .LBB65_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: s_or_b32 s36, s4, s5 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: s_or_b32 s37, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_or_b32 s38, s4, s5 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s12, 16 +; SI-NEXT: s_or_b32 s39, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: s_or_b32 s40, s4, s5 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s10, 16 +; SI-NEXT: s_or_b32 s41, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_or_b32 s42, s4, s5 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s8, 16 +; SI-NEXT: s_or_b32 s43, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s7, 16 +; SI-NEXT: s_or_b32 s44, s4, s5 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s45, s4, s5 +; SI-NEXT: s_cbranch_execnz .LBB65_4 ; SI-NEXT: .LBB65_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s13 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s12 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s9 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -27560,11 +27237,11 @@ define inreg <5 x double> @bitcast_v20f16_to_v5f64_scalar(<20 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s22 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -27572,12 +27249,12 @@ define inreg <5 x double> @bitcast_v20f16_to_v5f64_scalar(<20 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -27590,11 +27267,41 @@ define inreg <5 x double> @bitcast_v20f16_to_v5f64_scalar(<20 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: .LBB65_3: ; %end -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB65_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB65_5 +; SI-NEXT: .LBB65_3: +; SI-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 ; SI-NEXT: s_branch .LBB65_2 +; SI-NEXT: .LBB65_4: +; SI-NEXT: v_mov_b32_e32 v0, s36 +; SI-NEXT: v_mov_b32_e32 v1, s37 +; SI-NEXT: v_mov_b32_e32 v2, s38 +; SI-NEXT: v_mov_b32_e32 v3, s39 +; SI-NEXT: v_mov_b32_e32 v4, s40 +; SI-NEXT: v_mov_b32_e32 v5, s41 +; SI-NEXT: v_mov_b32_e32 v6, s42 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s44 +; SI-NEXT: v_mov_b32_e32 v9, s45 +; SI-NEXT: v_mov_b32_e32 v10, s46 +; SI-NEXT: v_mov_b32_e32 v11, s47 +; SI-NEXT: v_mov_b32_e32 v12, s48 +; SI-NEXT: v_mov_b32_e32 v13, s49 +; SI-NEXT: v_mov_b32_e32 v14, s50 +; SI-NEXT: v_mov_b32_e32 v15, s51 +; SI-NEXT: .LBB65_5: ; %end +; SI-NEXT: v_readlane_b32 s51, v16, 7 +; SI-NEXT: v_readlane_b32 s50, v16, 6 +; SI-NEXT: v_readlane_b32 s49, v16, 5 +; SI-NEXT: v_readlane_b32 s48, v16, 4 +; SI-NEXT: v_readlane_b32 s39, v16, 3 +; SI-NEXT: v_readlane_b32 s38, v16, 2 +; SI-NEXT: v_readlane_b32 s37, v16, 1 +; SI-NEXT: v_readlane_b32 s36, v16, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20f16_to_v5f64_scalar: ; VI: ; %bb.0: @@ -27777,65 +27484,30 @@ define <20 x half> @bitcast_v5f64_to_v20f16(<5 x double> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB66_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: v_alignbit_b32 v10, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v11, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v12, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v13, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v16, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 ; SI-NEXT: .LBB66_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB66_4 @@ -27843,80 +27515,50 @@ define <20 x half> @bitcast_v5f64_to_v20f16(<5 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_alignbit_b32 v10, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v11, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v12, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v13, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v16, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 ; SI-NEXT: .LBB66_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v26 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v20 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v19 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v16 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v11 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v12 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v13 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v18 +; SI-NEXT: v_or_b32_e32 v4, v4, v12 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v17 +; SI-NEXT: v_or_b32_e32 v6, v6, v11 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v14 +; SI-NEXT: v_or_b32_e32 v1, v1, v16 +; SI-NEXT: v_or_b32_e32 v3, v3, v13 +; SI-NEXT: v_or_b32_e32 v5, v5, v12 +; SI-NEXT: v_or_b32_e32 v7, v7, v11 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5f64_to_v20f16: @@ -27995,139 +27637,101 @@ define inreg <20 x half> @bitcast_v5f64_to_v20f16_scalar(<5 x double> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s26, 0 -; SI-NEXT: s_cbranch_scc0 .LBB67_4 +; SI-NEXT: s_cbranch_scc0 .LBB67_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 -; SI-NEXT: s_cbranch_execnz .LBB67_3 +; SI-NEXT: s_lshr_b32 s40, s25, 16 +; SI-NEXT: s_lshr_b32 s29, s23, 16 +; SI-NEXT: s_lshr_b32 s28, s21, 16 +; SI-NEXT: s_lshr_b32 s27, s19, 16 +; SI-NEXT: s_lshr_b32 s26, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[16:17], 16 +; SI-NEXT: s_cbranch_execnz .LBB67_4 ; SI-NEXT: .LBB67_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[3:4], s[16:17], 1.0 -; SI-NEXT: v_add_f64 v[0:1], s[18:19], 1.0 -; SI-NEXT: v_add_f64 v[14:15], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[5:6], s[22:23], 1.0 -; SI-NEXT: v_add_f64 v[7:8], s[24:25], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: .LBB67_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 -; SI-NEXT: v_or_b32_e32 v0, v18, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v2, v16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v15 -; SI-NEXT: v_or_b32_e32 v4, v14, v4 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v16 -; SI-NEXT: v_or_b32_e32 v5, v5, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v7, v7, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 -; SI-NEXT: v_or_b32_e32 v1, v20, v1 -; SI-NEXT: v_or_b32_e32 v3, v18, v3 -; SI-NEXT: v_or_b32_e32 v6, v13, v6 -; SI-NEXT: v_or_b32_e32 v8, v11, v8 +; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_lshr_b64 v[10:11], v[8:9], 16 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_lshr_b64 v[11:12], v[6:7], 16 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_lshr_b64 v[12:13], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v1 +; SI-NEXT: s_branch .LBB67_5 +; SI-NEXT: .LBB67_3: +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: s_branch .LBB67_2 +; SI-NEXT: .LBB67_4: +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v16, s40 +; SI-NEXT: v_mov_b32_e32 v17, s29 +; SI-NEXT: v_mov_b32_e32 v18, s28 +; SI-NEXT: v_mov_b32_e32 v19, s27 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v14, s12 +; SI-NEXT: v_mov_b32_e32 v13, s10 +; SI-NEXT: v_mov_b32_e32 v12, s8 +; SI-NEXT: v_mov_b32_e32 v11, s6 +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: .LBB67_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 +; SI-NEXT: v_or_b32_e32 v2, v2, v13 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v19 +; SI-NEXT: v_or_b32_e32 v4, v4, v12 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v18 +; SI-NEXT: v_or_b32_e32 v6, v6, v11 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v14 +; SI-NEXT: v_or_b32_e32 v3, v3, v13 +; SI-NEXT: v_or_b32_e32 v5, v5, v12 +; SI-NEXT: v_or_b32_e32 v7, v7, v11 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB67_4: -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: s_branch .LBB67_2 ; ; VI-LABEL: bitcast_v5f64_to_v20f16_scalar: ; VI: ; %bb.0: @@ -28252,56 +27856,26 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v20f16_to_v5i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v9 +; SI-NEXT: v_mov_b32_e32 v16, v9 +; SI-NEXT: v_mov_b32_e32 v17, v8 +; SI-NEXT: v_mov_b32_e32 v18, v7 +; SI-NEXT: v_mov_b32_e32 v19, v6 +; SI-NEXT: v_mov_b32_e32 v20, v5 +; SI-NEXT: v_mov_b32_e32 v21, v4 +; SI-NEXT: v_mov_b32_e32 v22, v3 +; SI-NEXT: v_mov_b32_e32 v23, v2 +; SI-NEXT: v_mov_b32_e32 v24, v1 +; SI-NEXT: v_mov_b32_e32 v25, v0 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -28314,36 +27888,36 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB68_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 -; SI-NEXT: v_or_b32_e32 v1, v32, v1 -; SI-NEXT: v_or_b32_e32 v2, v30, v2 -; SI-NEXT: v_or_b32_e32 v3, v28, v3 -; SI-NEXT: v_or_b32_e32 v4, v26, v4 -; SI-NEXT: v_or_b32_e32 v5, v24, v5 -; SI-NEXT: v_or_b32_e32 v6, v22, v6 -; SI-NEXT: v_or_b32_e32 v7, v20, v7 -; SI-NEXT: v_or_b32_e32 v8, v18, v8 -; SI-NEXT: v_or_b32_e32 v9, v16, v9 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v35 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v32 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v31 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v30 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v28 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v27 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v26 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 @@ -28354,13 +27928,23 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB68_2 ; SI-NEXT: .LBB68_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v24 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -28373,10 +27957,10 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v22 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -28384,10 +27968,10 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v31 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -28395,11 +27979,11 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v30 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v29 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -28407,11 +27991,11 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v19 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v18 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -28419,11 +28003,11 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v16 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 @@ -28552,124 +28136,106 @@ define inreg <5 x i64> @bitcast_v20f16_to_v5i64_scalar(<20 x half> inreg %a, i32 ; SI-LABEL: bitcast_v20f16_to_v5i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v10 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v16, s36, 0 +; SI-NEXT: v_writelane_b32 v16, s37, 1 +; SI-NEXT: v_writelane_b32 v16, s38, 2 +; SI-NEXT: v_writelane_b32 v16, s39, 3 +; SI-NEXT: v_writelane_b32 v16, s48, 4 +; SI-NEXT: v_writelane_b32 v16, s49, 5 +; SI-NEXT: s_lshr_b32 s6, s25, 16 +; SI-NEXT: s_lshr_b32 s7, s24, 16 +; SI-NEXT: s_lshr_b32 s8, s23, 16 +; SI-NEXT: s_lshr_b32 s9, s22, 16 +; SI-NEXT: s_lshr_b32 s10, s21, 16 +; SI-NEXT: s_lshr_b32 s11, s20, 16 +; SI-NEXT: s_lshr_b32 s12, s19, 16 +; SI-NEXT: s_lshr_b32 s13, s18, 16 +; SI-NEXT: s_lshr_b32 s14, s17, 16 +; SI-NEXT: s_lshr_b32 s15, s16, 16 +; SI-NEXT: v_writelane_b32 v16, s50, 6 ; SI-NEXT: s_cmp_lg_u32 s26, 0 -; SI-NEXT: s_cbranch_scc0 .LBB69_4 +; SI-NEXT: v_writelane_b32 v16, s51, 7 +; SI-NEXT: s_cbranch_scc0 .LBB69_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 -; SI-NEXT: v_or_b32_e32 v1, v32, v1 -; SI-NEXT: v_or_b32_e32 v2, v30, v2 -; SI-NEXT: v_or_b32_e32 v3, v28, v3 -; SI-NEXT: v_or_b32_e32 v4, v26, v4 -; SI-NEXT: v_or_b32_e32 v5, v24, v5 -; SI-NEXT: v_or_b32_e32 v6, v22, v6 -; SI-NEXT: v_or_b32_e32 v7, v20, v7 -; SI-NEXT: v_or_b32_e32 v8, v18, v8 -; SI-NEXT: v_or_b32_e32 v9, v16, v9 -; SI-NEXT: s_cbranch_execnz .LBB69_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: s_or_b32 s36, s4, s5 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: s_or_b32 s37, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_or_b32 s38, s4, s5 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s12, 16 +; SI-NEXT: s_or_b32 s39, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: s_or_b32 s40, s4, s5 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s10, 16 +; SI-NEXT: s_or_b32 s41, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_or_b32 s42, s4, s5 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s8, 16 +; SI-NEXT: s_or_b32 s43, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s7, 16 +; SI-NEXT: s_or_b32 s44, s4, s5 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s45, s4, s5 +; SI-NEXT: s_cbranch_execnz .LBB69_4 ; SI-NEXT: .LBB69_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s13 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s12 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s9 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -28677,11 +28243,11 @@ define inreg <5 x i64> @bitcast_v20f16_to_v5i64_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s22 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -28689,12 +28255,12 @@ define inreg <5 x i64> @bitcast_v20f16_to_v5i64_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -28707,11 +28273,41 @@ define inreg <5 x i64> @bitcast_v20f16_to_v5i64_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: .LBB69_3: ; %end -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB69_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB69_5 +; SI-NEXT: .LBB69_3: +; SI-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 ; SI-NEXT: s_branch .LBB69_2 +; SI-NEXT: .LBB69_4: +; SI-NEXT: v_mov_b32_e32 v0, s36 +; SI-NEXT: v_mov_b32_e32 v1, s37 +; SI-NEXT: v_mov_b32_e32 v2, s38 +; SI-NEXT: v_mov_b32_e32 v3, s39 +; SI-NEXT: v_mov_b32_e32 v4, s40 +; SI-NEXT: v_mov_b32_e32 v5, s41 +; SI-NEXT: v_mov_b32_e32 v6, s42 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s44 +; SI-NEXT: v_mov_b32_e32 v9, s45 +; SI-NEXT: v_mov_b32_e32 v10, s46 +; SI-NEXT: v_mov_b32_e32 v11, s47 +; SI-NEXT: v_mov_b32_e32 v12, s48 +; SI-NEXT: v_mov_b32_e32 v13, s49 +; SI-NEXT: v_mov_b32_e32 v14, s50 +; SI-NEXT: v_mov_b32_e32 v15, s51 +; SI-NEXT: .LBB69_5: ; %end +; SI-NEXT: v_readlane_b32 s51, v16, 7 +; SI-NEXT: v_readlane_b32 s50, v16, 6 +; SI-NEXT: v_readlane_b32 s49, v16, 5 +; SI-NEXT: v_readlane_b32 s48, v16, 4 +; SI-NEXT: v_readlane_b32 s39, v16, 3 +; SI-NEXT: v_readlane_b32 s38, v16, 2 +; SI-NEXT: v_readlane_b32 s37, v16, 1 +; SI-NEXT: v_readlane_b32 s36, v16, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20f16_to_v5i64_scalar: ; VI: ; %bb.0: @@ -28894,70 +28490,30 @@ define <20 x half> @bitcast_v5i64_to_v20f16(<5 x i64> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB70_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: v_alignbit_b32 v10, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v11, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v12, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v13, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v16, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 ; SI-NEXT: .LBB70_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB70_4 @@ -28972,78 +28528,48 @@ define <20 x half> @bitcast_v5i64_to_v20f16(<5 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_alignbit_b32 v10, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v11, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v12, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v13, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v16, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 ; SI-NEXT: .LBB70_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v26 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v20 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v19 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v16 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v11 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v12 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v13 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v18 +; SI-NEXT: v_or_b32_e32 v4, v4, v12 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v17 +; SI-NEXT: v_or_b32_e32 v6, v6, v11 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v14 +; SI-NEXT: v_or_b32_e32 v1, v1, v16 +; SI-NEXT: v_or_b32_e32 v3, v3, v13 +; SI-NEXT: v_or_b32_e32 v5, v5, v12 +; SI-NEXT: v_or_b32_e32 v7, v7, v11 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5i64_to_v20f16: @@ -29142,141 +28668,91 @@ define inreg <20 x half> @bitcast_v5i64_to_v20f16_scalar(<5 x i64> inreg %a, i32 ; SI-NEXT: s_cmp_lg_u32 s26, 0 ; SI-NEXT: s_cbranch_scc0 .LBB71_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 +; SI-NEXT: s_lshr_b32 s26, s25, 16 +; SI-NEXT: s_lshr_b32 s27, s23, 16 +; SI-NEXT: s_lshr_b32 s28, s21, 16 +; SI-NEXT: s_lshr_b32 s29, s19, 16 +; SI-NEXT: s_lshr_b32 s40, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB71_3 ; SI-NEXT: .LBB71_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s16, 3 -; SI-NEXT: s_addc_u32 s5, s17, 0 -; SI-NEXT: s_lshr_b32 s6, s4, 16 -; SI-NEXT: s_lshr_b32 s7, s5, 16 -; SI-NEXT: s_add_u32 s8, s18, 3 -; SI-NEXT: s_addc_u32 s9, s19, 0 -; SI-NEXT: s_lshr_b32 s10, s8, 16 -; SI-NEXT: s_lshr_b32 s11, s9, 16 -; SI-NEXT: s_add_u32 s12, s20, 3 -; SI-NEXT: s_addc_u32 s13, s21, 0 -; SI-NEXT: s_lshr_b32 s14, s12, 16 -; SI-NEXT: s_lshr_b32 s15, s13, 16 -; SI-NEXT: s_add_u32 s16, s22, 3 -; SI-NEXT: s_addc_u32 s17, s23, 0 -; SI-NEXT: s_lshr_b32 s18, s16, 16 -; SI-NEXT: s_lshr_b32 s19, s17, 16 -; SI-NEXT: s_add_u32 s20, s24, 3 -; SI-NEXT: s_addc_u32 s21, s25, 0 -; SI-NEXT: s_lshr_b32 s22, s20, 16 -; SI-NEXT: s_lshr_b32 s23, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s6 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s26, s25, 16 +; SI-NEXT: s_lshr_b32 s27, s23, 16 +; SI-NEXT: s_lshr_b32 s28, s21, 16 +; SI-NEXT: s_lshr_b32 s29, s19, 16 +; SI-NEXT: s_lshr_b32 s40, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[16:17], 16 ; SI-NEXT: .LBB71_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 -; SI-NEXT: v_or_b32_e32 v0, v18, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; SI-NEXT: v_or_b32_e32 v2, v16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v16 -; SI-NEXT: v_or_b32_e32 v5, v5, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v7, v7, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 -; SI-NEXT: v_or_b32_e32 v1, v20, v1 -; SI-NEXT: v_or_b32_e32 v3, v18, v3 -; SI-NEXT: v_or_b32_e32 v4, v15, v4 -; SI-NEXT: v_or_b32_e32 v6, v13, v6 -; SI-NEXT: v_or_b32_e32 v8, v11, v8 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s12, 16 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s9, s40, 16 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s18, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s19, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s20, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_or_b32 s8, s11, s8 +; SI-NEXT: s_and_b32 s11, s21, 0xffff +; SI-NEXT: s_lshl_b32 s12, s28, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s12, s22, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s12, s6 +; SI-NEXT: s_and_b32 s12, s23, 0xffff +; SI-NEXT: s_lshl_b32 s13, s27, 16 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s24, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s13, s4 +; SI-NEXT: s_and_b32 s13, s25, 0xffff +; SI-NEXT: s_lshl_b32 s14, s26, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s10 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_mov_b32_e32 v6, s6 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: v_mov_b32_e32 v9, s13 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB71_4: -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: s_branch .LBB71_2 ; ; VI-LABEL: bitcast_v5i64_to_v20f16_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll index 040f0c8b4d299..8026714f25992 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll @@ -667,28 +667,20 @@ define <2 x half> @bitcast_i32_to_v2f16(i32 %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB8_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: .LBB8_4: ; %end +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_i32_to_v2f16: @@ -754,24 +746,19 @@ define inreg <2 x half> @bitcast_i32_to_v2f16_scalar(i32 inreg %a, i32 inreg %b) ; SI-NEXT: s_cmp_lg_u32 s17, 0 ; SI-NEXT: s_cbranch_scc0 .LBB9_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 ; SI-NEXT: s_cbranch_execnz .LBB9_3 ; SI-NEXT: .LBB9_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s6, s16, 16 ; SI-NEXT: .LBB9_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB9_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: s_branch .LBB9_2 ; ; VI-LABEL: bitcast_i32_to_v2f16_scalar: @@ -844,38 +831,31 @@ define i32 @bitcast_v2f16_to_i32(<2 x half> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB10_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB10_4 -; SI-NEXT: .LBB10_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB10_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v0, v1 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB10_2 -; SI-NEXT: .LBB10_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; SI-NEXT: s_cbranch_execz .LBB10_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v0, v1 +; SI-NEXT: .LBB10_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f16_to_i32: @@ -942,31 +922,30 @@ define inreg i32 @bitcast_v2f16_to_i32_scalar(<2 x half> inreg %a, i32 inreg %b) ; SI-LABEL: bitcast_v2f16_to_i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s17, 0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: s_cbranch_scc0 .LBB11_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_cbranch_execnz .LBB11_4 ; SI-NEXT: .LBB11_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: .LBB11_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB11_4: -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: .LBB11_3: +; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: s_branch .LBB11_2 +; SI-NEXT: .LBB11_4: +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f16_to_i32_scalar: ; VI: ; %bb.0: @@ -2957,28 +2936,20 @@ define <2 x half> @bitcast_f32_to_v2f16(float %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB28_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: .LBB28_4: ; %end +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f32_to_v2f16: @@ -3042,27 +3013,25 @@ define inreg <2 x half> @bitcast_f32_to_v2f16_scalar(float inreg %a, i32 inreg % ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s17, 0 -; SI-NEXT: s_cbranch_scc0 .LBB29_4 +; SI-NEXT: s_cbranch_scc0 .LBB29_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: s_cbranch_execnz .LBB29_3 +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB29_4 ; SI-NEXT: .LBB29_2: ; %cmp.true ; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: .LBB29_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_branch .LBB29_5 +; SI-NEXT: .LBB29_3: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB29_2 +; SI-NEXT: .LBB29_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: .LBB29_5: ; %end +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB29_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: s_branch .LBB29_2 ; ; VI-LABEL: bitcast_f32_to_v2f16_scalar: ; VI: ; %bb.0: @@ -3135,38 +3104,31 @@ define float @bitcast_v2f16_to_f32(<2 x half> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB30_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB30_4 -; SI-NEXT: .LBB30_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB30_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v0, v1 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB30_2 -; SI-NEXT: .LBB30_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; SI-NEXT: s_cbranch_execz .LBB30_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v0, v1 +; SI-NEXT: .LBB30_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f16_to_f32: @@ -3233,31 +3195,30 @@ define inreg float @bitcast_v2f16_to_f32_scalar(<2 x half> inreg %a, i32 inreg % ; SI-LABEL: bitcast_v2f16_to_f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s17, 0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_cbranch_scc0 .LBB31_4 +; SI-NEXT: s_cbranch_scc0 .LBB31_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: s_cbranch_execnz .LBB31_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_cbranch_execnz .LBB31_4 ; SI-NEXT: .LBB31_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: .LBB31_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB31_4: -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: .LBB31_3: +; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: s_branch .LBB31_2 +; SI-NEXT: .LBB31_4: +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f16_to_f32_scalar: ; VI: ; %bb.0: @@ -4901,31 +4862,30 @@ define <2 x half> @bitcast_v2i16_to_v2f16(<2 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v2i16_to_v2f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v0, v1 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB44_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v3 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: .LBB44_4: ; %end +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v0, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i16_to_v2f16: @@ -4995,23 +4955,25 @@ define inreg <2 x half> @bitcast_v2i16_to_v2f16_scalar(<2 x i16> inreg %a, i32 i ; SI-NEXT: s_cmp_lg_u32 s17, 0 ; SI-NEXT: s_cbranch_scc0 .LBB45_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s7, s4, s5 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; SI-NEXT: s_add_i32 s5, s6, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s6, s5, 16 +; SI-NEXT: s_or_b32 s7, s4, s6 +; SI-NEXT: s_and_b32 s6, s5, 0xffff ; SI-NEXT: .LBB45_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: s_and_b32 s4, s7, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: s_branch .LBB45_2 ; ; VI-LABEL: bitcast_v2i16_to_v2f16_scalar: @@ -5088,11 +5050,7 @@ define <2 x i16> @bitcast_v2f16_to_v2i16(<2 x half> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] @@ -5177,31 +5135,31 @@ define inreg <2 x i16> @bitcast_v2f16_to_v2i16_scalar(<2 x half> inreg %a, i32 i ; SI-LABEL: bitcast_v2f16_to_v2i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s6, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s17, 0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SI-NEXT: s_cbranch_scc0 .LBB47_4 +; SI-NEXT: s_cbranch_scc0 .LBB47_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: s_cbranch_execnz .LBB47_4 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: s_branch .LBB47_5 +; SI-NEXT: .LBB47_3: +; SI-NEXT: s_branch .LBB47_2 +; SI-NEXT: .LBB47_4: +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s16 +; SI-NEXT: .LBB47_5: ; %end ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB47_4: -; SI-NEXT: s_branch .LBB47_2 ; ; VI-LABEL: bitcast_v2f16_to_v2i16_scalar: ; VI: ; %bb.0: @@ -6911,39 +6869,35 @@ define <2 x bfloat> @bitcast_v2f16_to_v2bf16(<2 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v2f16_to_v2bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB60_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: .LBB60_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f16_to_v2bf16: @@ -7010,36 +6964,36 @@ define inreg <2 x bfloat> @bitcast_v2f16_to_v2bf16_scalar(<2 x half> inreg %a, i ; SI-LABEL: bitcast_v2f16_to_v2bf16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s6, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s17, 0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NEXT: s_cbranch_scc0 .LBB61_4 +; SI-NEXT: s_cbranch_scc0 .LBB61_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; SI-NEXT: s_cbranch_execnz .LBB61_3 +; SI-NEXT: s_lshl_b32 s7, s16, 16 +; SI-NEXT: s_lshl_b32 s8, s6, 16 +; SI-NEXT: s_cbranch_execnz .LBB61_4 ; SI-NEXT: .LBB61_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; SI-NEXT: .LBB61_3: ; %end -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: s_branch .LBB61_5 +; SI-NEXT: .LBB61_3: +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: s_branch .LBB61_2 +; SI-NEXT: .LBB61_4: +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: .LBB61_5: ; %end ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshr_b64 v[0:1], v[1:2], 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB61_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: s_branch .LBB61_2 ; ; VI-LABEL: bitcast_v2f16_to_v2bf16_scalar: ; VI: ; %bb.0: @@ -7117,41 +7071,34 @@ define <2 x half> @bitcast_v2bf16_to_v2f16(<2 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v2bf16_to_v2f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v0 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB62_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_alignbit_b32 v1, v0, v3, 16 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: .LBB62_2: ; %Flow +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB62_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: .LBB62_4: ; %end +; SI-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2bf16_to_v2f16: @@ -7309,33 +7256,27 @@ define inreg <2 x half> @bitcast_v2bf16_to_v2f16_scalar(<2 x bfloat> inreg %a, i ; SI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; SI-NEXT: s_lshl_b32 s5, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s4 ; SI-NEXT: v_mul_f32_e64 v0, 1.0, s5 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 ; SI-NEXT: s_cbranch_scc0 .LBB63_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_lshr_b64 v[2:3], v[0:1], 16 ; SI-NEXT: s_cbranch_execnz .LBB63_3 ; SI-NEXT: .LBB63_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshr_b64 v[2:3], v[0:1], 16 ; SI-NEXT: .LBB63_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB63_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_branch .LBB63_2 ; ; VI-LABEL: bitcast_v2bf16_to_v2f16_scalar: @@ -7511,38 +7452,31 @@ define <1 x i32> @bitcast_v2f16_to_v1i32(<2 x half> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB64_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB64_4 -; SI-NEXT: .LBB64_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB64_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v0, v1 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB64_2 -; SI-NEXT: .LBB64_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; SI-NEXT: s_cbranch_execz .LBB64_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v0, v1 +; SI-NEXT: .LBB64_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f16_to_v1i32: @@ -7609,31 +7543,30 @@ define inreg <1 x i32> @bitcast_v2f16_to_v1i32_scalar(<2 x half> inreg %a, i32 i ; SI-LABEL: bitcast_v2f16_to_v1i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s17, 0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_cbranch_scc0 .LBB65_4 +; SI-NEXT: s_cbranch_scc0 .LBB65_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: s_cbranch_execnz .LBB65_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_cbranch_execnz .LBB65_4 ; SI-NEXT: .LBB65_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: .LBB65_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB65_4: -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: .LBB65_3: +; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: s_branch .LBB65_2 +; SI-NEXT: .LBB65_4: +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f16_to_v1i32_scalar: ; VI: ; %bb.0: @@ -7713,28 +7646,20 @@ define <2 x half> @bitcast_v1i32_to_v2f16(<1 x i32> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB66_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: .LBB66_4: ; %end +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v1i32_to_v2f16: @@ -7800,24 +7725,19 @@ define inreg <2 x half> @bitcast_v1i32_to_v2f16_scalar(<1 x i32> inreg %a, i32 i ; SI-NEXT: s_cmp_lg_u32 s17, 0 ; SI-NEXT: s_cbranch_scc0 .LBB67_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 ; SI-NEXT: s_cbranch_execnz .LBB67_3 ; SI-NEXT: .LBB67_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s6, s16, 16 ; SI-NEXT: .LBB67_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB67_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: s_branch .LBB67_2 ; ; VI-LABEL: bitcast_v1i32_to_v2f16_scalar: @@ -7889,15 +7809,12 @@ define <4 x i8> @bitcast_v2f16_to_v4i8(<2 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v2f16_to_v4i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execnz .LBB68_3 @@ -7908,8 +7825,9 @@ define <4 x i8> @bitcast_v2f16_to_v4i8(<2 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB68_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; SI-NEXT: v_bfe_u32 v3, v2, 8, 8 ; SI-NEXT: ; implicit-def: $vgpr4 @@ -8071,22 +7989,19 @@ define inreg <4 x i8> @bitcast_v2f16_to_v4i8_scalar(<2 x half> inreg %a, i32 inr ; SI-LABEL: bitcast_v2f16_to_v4i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s17, 0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 -; SI-NEXT: s_cbranch_scc0 .LBB69_4 +; SI-NEXT: s_cbranch_scc0 .LBB69_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; SI-NEXT: v_bfe_u32 v3, v2, 8, 8 -; SI-NEXT: s_cbranch_execnz .LBB69_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_lshr_b32 s8, s7, 8 +; SI-NEXT: s_bfe_u32 s9, s6, 0x80008 +; SI-NEXT: s_cbranch_execnz .LBB69_4 ; SI-NEXT: .LBB69_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 @@ -8095,13 +8010,18 @@ define inreg <4 x i8> @bitcast_v2f16_to_v4i8_scalar(<2 x half> inreg %a, i32 inr ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; SI-NEXT: v_bfe_u32 v3, v2, 8, 8 -; SI-NEXT: .LBB69_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB69_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: .LBB69_3: +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr9 ; SI-NEXT: s_branch .LBB69_2 +; SI-NEXT: .LBB69_4: +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f16_to_v4i8_scalar: ; VI: ; %bb.0: @@ -8215,43 +8135,45 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB70_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: v_or_b32_e32 v2, v1, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v0, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v2 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: .LBB70_2: ; %Flow +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB70_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v1, v4, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_or_b32_e32 v0, v5, v0 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v0, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v2 ; SI-NEXT: .LBB70_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i8_to_v2f16: @@ -8456,34 +8378,38 @@ define inreg <2 x half> @bitcast_v4i8_to_v2f16_scalar(<4 x i8> inreg %a, i32 inr ; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_and_b32 s4, s18, 0xff -; SI-NEXT: s_lshl_b32 s5, s19, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s6, s19, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s6, s5, 16 +; SI-NEXT: s_or_b32 s6, s4, s6 +; SI-NEXT: s_and_b32 s7, s5, 0xffff ; SI-NEXT: s_cbranch_execnz .LBB71_3 ; SI-NEXT: .LBB71_2: ; %cmp.true -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_and_b32 s4, s18, 0xff -; SI-NEXT: s_lshl_b32 s5, s19, 8 ; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s16, 0xff -; SI-NEXT: s_lshl_b32 s6, s17, 8 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s6, s19, 8 ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_addk_i32 s5, 0x300 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s6, s5, 16 +; SI-NEXT: s_or_b32 s6, s4, s6 +; SI-NEXT: s_and_b32 s7, s5, 0xffff ; SI-NEXT: .LBB71_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: s_and_b32 s4, s6, 0xffff +; SI-NEXT: s_lshl_b32 s5, s7, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB71_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: s_branch .LBB71_2 ; ; VI-LABEL: bitcast_v4i8_to_v2f16_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll index e81978684b8b6..70ed2ca42b706 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll @@ -1503,170 +1503,93 @@ define <22 x half> @bitcast_v11i32_to_v22f16(<11 x i32> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB8_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: v_alignbit_b32 v12, v0, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v13, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v14, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v15, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v18, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 ; SI-NEXT: .LBB8_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB8_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_alignbit_b32 v11, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v13, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v14, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v15, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v18, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v12, v0, v10, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 ; SI-NEXT: .LBB8_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v26 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v27 -; SI-NEXT: v_or_b32_e32 v2, v5, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v23 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v22 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v19 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v18 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v16 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v18 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; SI-NEXT: v_or_b32_e32 v2, v2, v15 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v20 +; SI-NEXT: v_or_b32_e32 v4, v4, v14 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_or_b32_e32 v6, v6, v13 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v1, v1, v18 +; SI-NEXT: v_or_b32_e32 v3, v3, v15 +; SI-NEXT: v_or_b32_e32 v5, v5, v14 +; SI-NEXT: v_or_b32_e32 v7, v7, v13 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v11i32_to_v22f16: @@ -1765,154 +1688,99 @@ define inreg <22 x half> @bitcast_v11i32_to_v22f16_scalar(<11 x i32> inreg %a, i ; SI-NEXT: s_cmp_lg_u32 s27, 0 ; SI-NEXT: s_cbranch_scc0 .LBB9_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_lshr_b32 s27, s25, 16 +; SI-NEXT: s_lshr_b32 s40, s23, 16 +; SI-NEXT: s_lshr_b32 s41, s21, 16 +; SI-NEXT: s_lshr_b32 s42, s19, 16 +; SI-NEXT: s_lshr_b32 s43, s17, 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB9_3 ; SI-NEXT: .LBB9_2: ; %cmp.true -; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: s_lshr_b32 s5, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s5 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_lshr_b32 s27, s25, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s40, s23, 16 +; SI-NEXT: s_lshr_b32 s41, s21, 16 +; SI-NEXT: s_lshr_b32 s42, s19, 16 +; SI-NEXT: s_lshr_b32 s43, s17, 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[26:27], 16 ; SI-NEXT: .LBB9_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v2, v2, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_or_b32_e32 v5, v15, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 -; SI-NEXT: v_or_b32_e32 v7, v13, v7 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_or_b32_e32 v8, v8, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 -; SI-NEXT: v_or_b32_e32 v0, v0, v21 -; SI-NEXT: v_or_b32_e32 v1, v1, v20 -; SI-NEXT: v_or_b32_e32 v4, v19, v4 -; SI-NEXT: v_or_b32_e32 v6, v17, v6 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s14, 16 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s9, s43, 16 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s18, 0xffff +; SI-NEXT: s_lshl_b32 s11, s12, 16 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_and_b32 s11, s19, 0xffff +; SI-NEXT: s_lshl_b32 s12, s42, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s12, s20, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_or_b32 s10, s12, s10 +; SI-NEXT: s_and_b32 s12, s21, 0xffff +; SI-NEXT: s_lshl_b32 s13, s41, 16 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s22, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s13, s6 +; SI-NEXT: s_and_b32 s13, s23, 0xffff +; SI-NEXT: s_lshl_b32 s14, s40, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s14, s24, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s14, s4 +; SI-NEXT: s_and_b32 s14, s25, 0xffff +; SI-NEXT: s_lshl_b32 s15, s27, 16 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_and_b32 s15, s26, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_or_b32 s8, s15, s8 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s10 +; SI-NEXT: v_mov_b32_e32 v5, s12 +; SI-NEXT: v_mov_b32_e32 v6, s6 +; SI-NEXT: v_mov_b32_e32 v7, s13 +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: v_mov_b32_e32 v9, s14 +; SI-NEXT: v_mov_b32_e32 v10, s8 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB9_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: s_branch .LBB9_2 ; ; VI-LABEL: bitcast_v11i32_to_v22f16_scalar: @@ -2038,61 +1906,28 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v22f16_to_v11i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 +; SI-NEXT: v_mov_b32_e32 v22, v10 +; SI-NEXT: v_mov_b32_e32 v12, v9 +; SI-NEXT: v_mov_b32_e32 v13, v8 +; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_mov_b32_e32 v15, v6 +; SI-NEXT: v_mov_b32_e32 v16, v5 +; SI-NEXT: v_mov_b32_e32 v17, v4 +; SI-NEXT: v_mov_b32_e32 v18, v3 +; SI-NEXT: v_mov_b32_e32 v19, v2 +; SI-NEXT: v_mov_b32_e32 v20, v1 +; SI-NEXT: v_mov_b32_e32 v21, v0 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -2105,28 +1940,50 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB10_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v13 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 -; SI-NEXT: v_or_b32_e32 v1, v30, v1 -; SI-NEXT: v_or_b32_e32 v2, v28, v2 -; SI-NEXT: v_or_b32_e32 v3, v26, v3 -; SI-NEXT: v_or_b32_e32 v4, v24, v4 -; SI-NEXT: v_or_b32_e32 v5, v22, v5 -; SI-NEXT: v_or_b32_e32 v6, v20, v6 -; SI-NEXT: v_or_b32_e32 v7, v18, v7 -; SI-NEXT: v_or_b32_e32 v8, v16, v8 -; SI-NEXT: v_or_b32_e32 v9, v14, v9 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v28 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v27 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v26 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v24 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v23 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 @@ -2138,24 +1995,13 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB10_2 ; SI-NEXT: .LBB10_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -2163,10 +2009,10 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v19 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v31 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -2174,11 +2020,11 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -2186,11 +2032,11 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v17 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v16 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -2198,11 +2044,11 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -2210,11 +2056,11 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v14 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v13 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -2222,12 +2068,12 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v24 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -2360,92 +2206,59 @@ define inreg <11 x i32> @bitcast_v22f16_to_v11i32_scalar(<22 x half> inreg %a, i ; SI-LABEL: bitcast_v22f16_to_v11i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s5, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: s_lshr_b32 s5, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: s_lshr_b32 s5, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 -; SI-NEXT: s_lshr_b32 s5, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 -; SI-NEXT: s_lshr_b32 s5, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s5 -; SI-NEXT: s_lshr_b32 s5, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s5 -; SI-NEXT: s_lshr_b32 s5, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s5 -; SI-NEXT: s_lshr_b32 s5, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s5 -; SI-NEXT: s_lshr_b32 s5, s17, 16 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s5 -; SI-NEXT: s_lshr_b32 s5, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v2 +; SI-NEXT: s_lshr_b32 s15, s26, 16 +; SI-NEXT: s_lshr_b32 s40, s25, 16 +; SI-NEXT: s_lshr_b32 s41, s24, 16 +; SI-NEXT: s_lshr_b32 s42, s23, 16 +; SI-NEXT: s_lshr_b32 s43, s22, 16 +; SI-NEXT: s_lshr_b32 s44, s21, 16 +; SI-NEXT: s_lshr_b32 s45, s20, 16 +; SI-NEXT: s_lshr_b32 s46, s19, 16 +; SI-NEXT: s_lshr_b32 s47, s18, 16 +; SI-NEXT: s_lshr_b32 s56, s17, 16 +; SI-NEXT: s_lshr_b32 s57, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s27, 0 -; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: s_cbranch_scc0 .LBB11_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 -; SI-NEXT: v_or_b32_e32 v0, v31, v0 -; SI-NEXT: v_or_b32_e32 v1, v29, v1 -; SI-NEXT: v_or_b32_e32 v2, v27, v2 -; SI-NEXT: v_or_b32_e32 v3, v25, v3 -; SI-NEXT: v_or_b32_e32 v4, v23, v4 -; SI-NEXT: v_or_b32_e32 v5, v21, v5 -; SI-NEXT: v_or_b32_e32 v6, v19, v6 -; SI-NEXT: v_or_b32_e32 v7, v17, v7 -; SI-NEXT: v_or_b32_e32 v8, v15, v8 -; SI-NEXT: v_or_b32_e32 v9, v13, v9 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s56, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s47, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s46, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s45, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s44, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s43, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s42, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s12, s24, 0xffff +; SI-NEXT: s_lshl_b32 s13, s41, 16 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s25, 0xffff +; SI-NEXT: s_lshl_b32 s14, s40, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s14, s26, 0xffff +; SI-NEXT: s_lshl_b32 s27, s15, 16 +; SI-NEXT: s_or_b32 s14, s14, s27 +; SI-NEXT: s_cbranch_execnz .LBB11_4 ; SI-NEXT: .LBB11_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -2453,10 +2266,10 @@ define inreg <11 x i32> @bitcast_v22f16_to_v11i32_scalar(<22 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s47 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -2464,11 +2277,11 @@ define inreg <11 x i32> @bitcast_v22f16_to_v11i32_scalar(<22 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s46 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s45 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -2476,11 +2289,11 @@ define inreg <11 x i32> @bitcast_v22f16_to_v11i32_scalar(<22 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -2488,11 +2301,11 @@ define inreg <11 x i32> @bitcast_v22f16_to_v11i32_scalar(<22 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s43 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s42 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -2500,11 +2313,11 @@ define inreg <11 x i32> @bitcast_v22f16_to_v11i32_scalar(<22 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -2512,29 +2325,41 @@ define inreg <11 x i32> @bitcast_v22f16_to_v11i32_scalar(<22 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s40 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s26 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB11_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 +; SI-NEXT: .LBB11_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 ; SI-NEXT: s_branch .LBB11_2 +; SI-NEXT: .LBB11_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v8, s12 +; SI-NEXT: v_mov_b32_e32 v9, s13 +; SI-NEXT: v_mov_b32_e32 v10, s14 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v22f16_to_v11i32_scalar: ; VI: ; %bb.0: @@ -3690,170 +3515,93 @@ define <22 x half> @bitcast_v11f32_to_v22f16(<11 x float> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: v_alignbit_b32 v12, v0, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v13, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v14, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v15, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v18, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 ; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_alignbit_b32 v11, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v13, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v14, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v15, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v18, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v12, v0, v10, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 ; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v26 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v27 -; SI-NEXT: v_or_b32_e32 v2, v5, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v23 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v22 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v19 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v18 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v16 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v18 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; SI-NEXT: v_or_b32_e32 v2, v2, v15 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v20 +; SI-NEXT: v_or_b32_e32 v4, v4, v14 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_or_b32_e32 v6, v6, v13 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v1, v1, v18 +; SI-NEXT: v_or_b32_e32 v3, v3, v15 +; SI-NEXT: v_or_b32_e32 v5, v5, v14 +; SI-NEXT: v_or_b32_e32 v7, v7, v13 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v11f32_to_v22f16: @@ -3944,157 +3692,115 @@ define inreg <22 x half> @bitcast_v11f32_to_v22f16_scalar(<11 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s27, 0 -; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: s_cbranch_scc0 .LBB17_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s16 -; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: s_lshr_b32 s27, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s25, 16 +; SI-NEXT: s_lshr_b32 s42, s23, 16 +; SI-NEXT: s_lshr_b32 s41, s21, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 16 +; SI-NEXT: s_cbranch_execnz .LBB17_4 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v0, s17, 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_add_f32_e64 v19, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v18, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v17, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v16, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 ; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v7, s24, 1.0 -; SI-NEXT: v_add_f32_e64 v11, s25, 1.0 -; SI-NEXT: v_add_f32_e64 v9, s26, 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v0 +; SI-NEXT: v_lshr_b64 v[11:12], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[15:16], v[4:5], 16 +; SI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 ; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: .LBB17_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v2, v2, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_or_b32_e32 v5, v15, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 -; SI-NEXT: v_or_b32_e32 v7, v13, v7 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_or_b32_e32 v8, v8, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_lshr_b64 v[12:13], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[16:17], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[10:11], 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; SI-NEXT: s_branch .LBB17_5 +; SI-NEXT: .LBB17_3: +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: s_branch .LBB17_2 +; SI-NEXT: .LBB17_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v23, s27 +; SI-NEXT: v_mov_b32_e32 v22, s40 +; SI-NEXT: v_mov_b32_e32 v21, s41 +; SI-NEXT: v_mov_b32_e32 v20, s42 +; SI-NEXT: v_mov_b32_e32 v19, s43 +; SI-NEXT: v_mov_b32_e32 v13, s8 +; SI-NEXT: v_mov_b32_e32 v17, s14 +; SI-NEXT: v_mov_b32_e32 v16, s12 +; SI-NEXT: v_mov_b32_e32 v15, s10 +; SI-NEXT: v_mov_b32_e32 v12, s6 +; SI-NEXT: v_mov_b32_e32 v11, s4 +; SI-NEXT: .LBB17_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v14 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v22 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v3, v3, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 +; SI-NEXT: v_or_b32_e32 v4, v4, v14 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v21 +; SI-NEXT: v_or_b32_e32 v6, v6, v12 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v20 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 -; SI-NEXT: v_or_b32_e32 v0, v21, v0 -; SI-NEXT: v_or_b32_e32 v1, v1, v20 -; SI-NEXT: v_or_b32_e32 v4, v19, v4 -; SI-NEXT: v_or_b32_e32 v6, v17, v6 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v5, v5, v14 +; SI-NEXT: v_or_b32_e32 v7, v7, v12 ; SI-NEXT: v_or_b32_e32 v10, v10, v11 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB17_4: -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v11f32_to_v22f16_scalar: ; VI: ; %bb.0: @@ -4237,61 +3943,28 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v22f16_to_v11f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 +; SI-NEXT: v_mov_b32_e32 v22, v10 +; SI-NEXT: v_mov_b32_e32 v12, v9 +; SI-NEXT: v_mov_b32_e32 v13, v8 +; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_mov_b32_e32 v15, v6 +; SI-NEXT: v_mov_b32_e32 v16, v5 +; SI-NEXT: v_mov_b32_e32 v17, v4 +; SI-NEXT: v_mov_b32_e32 v18, v3 +; SI-NEXT: v_mov_b32_e32 v19, v2 +; SI-NEXT: v_mov_b32_e32 v20, v1 +; SI-NEXT: v_mov_b32_e32 v21, v0 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -4304,28 +3977,50 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB18_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v13 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 -; SI-NEXT: v_or_b32_e32 v1, v30, v1 -; SI-NEXT: v_or_b32_e32 v2, v28, v2 -; SI-NEXT: v_or_b32_e32 v3, v26, v3 -; SI-NEXT: v_or_b32_e32 v4, v24, v4 -; SI-NEXT: v_or_b32_e32 v5, v22, v5 -; SI-NEXT: v_or_b32_e32 v6, v20, v6 -; SI-NEXT: v_or_b32_e32 v7, v18, v7 -; SI-NEXT: v_or_b32_e32 v8, v16, v8 -; SI-NEXT: v_or_b32_e32 v9, v14, v9 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v28 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v27 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v26 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v24 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v23 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 @@ -4337,24 +4032,13 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: .LBB18_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -4362,10 +4046,10 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v19 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v31 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -4373,11 +4057,11 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -4385,11 +4069,11 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v17 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v16 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -4397,11 +4081,11 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -4409,11 +4093,11 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v14 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v13 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -4421,12 +4105,12 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v24 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -4559,92 +4243,59 @@ define inreg <11 x float> @bitcast_v22f16_to_v11f32_scalar(<22 x half> inreg %a, ; SI-LABEL: bitcast_v22f16_to_v11f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s5, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: s_lshr_b32 s5, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: s_lshr_b32 s5, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 -; SI-NEXT: s_lshr_b32 s5, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 -; SI-NEXT: s_lshr_b32 s5, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s5 -; SI-NEXT: s_lshr_b32 s5, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s5 -; SI-NEXT: s_lshr_b32 s5, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s5 -; SI-NEXT: s_lshr_b32 s5, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s5 -; SI-NEXT: s_lshr_b32 s5, s17, 16 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s5 -; SI-NEXT: s_lshr_b32 s5, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v2 +; SI-NEXT: s_lshr_b32 s15, s26, 16 +; SI-NEXT: s_lshr_b32 s40, s25, 16 +; SI-NEXT: s_lshr_b32 s41, s24, 16 +; SI-NEXT: s_lshr_b32 s42, s23, 16 +; SI-NEXT: s_lshr_b32 s43, s22, 16 +; SI-NEXT: s_lshr_b32 s44, s21, 16 +; SI-NEXT: s_lshr_b32 s45, s20, 16 +; SI-NEXT: s_lshr_b32 s46, s19, 16 +; SI-NEXT: s_lshr_b32 s47, s18, 16 +; SI-NEXT: s_lshr_b32 s56, s17, 16 +; SI-NEXT: s_lshr_b32 s57, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s27, 0 -; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: s_cbranch_scc0 .LBB19_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 -; SI-NEXT: v_or_b32_e32 v0, v31, v0 -; SI-NEXT: v_or_b32_e32 v1, v29, v1 -; SI-NEXT: v_or_b32_e32 v2, v27, v2 -; SI-NEXT: v_or_b32_e32 v3, v25, v3 -; SI-NEXT: v_or_b32_e32 v4, v23, v4 -; SI-NEXT: v_or_b32_e32 v5, v21, v5 -; SI-NEXT: v_or_b32_e32 v6, v19, v6 -; SI-NEXT: v_or_b32_e32 v7, v17, v7 -; SI-NEXT: v_or_b32_e32 v8, v15, v8 -; SI-NEXT: v_or_b32_e32 v9, v13, v9 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s56, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s47, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s46, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s45, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s44, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s43, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s42, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s12, s24, 0xffff +; SI-NEXT: s_lshl_b32 s13, s41, 16 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s25, 0xffff +; SI-NEXT: s_lshl_b32 s14, s40, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s14, s26, 0xffff +; SI-NEXT: s_lshl_b32 s27, s15, 16 +; SI-NEXT: s_or_b32 s14, s14, s27 +; SI-NEXT: s_cbranch_execnz .LBB19_4 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -4652,10 +4303,10 @@ define inreg <11 x float> @bitcast_v22f16_to_v11f32_scalar(<22 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s47 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -4663,11 +4314,11 @@ define inreg <11 x float> @bitcast_v22f16_to_v11f32_scalar(<22 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s46 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s45 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -4675,11 +4326,11 @@ define inreg <11 x float> @bitcast_v22f16_to_v11f32_scalar(<22 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -4687,11 +4338,11 @@ define inreg <11 x float> @bitcast_v22f16_to_v11f32_scalar(<22 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s43 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s42 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -4699,11 +4350,11 @@ define inreg <11 x float> @bitcast_v22f16_to_v11f32_scalar(<22 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -4711,29 +4362,41 @@ define inreg <11 x float> @bitcast_v22f16_to_v11f32_scalar(<22 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s40 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s26 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB19_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 +; SI-NEXT: .LBB19_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 ; SI-NEXT: s_branch .LBB19_2 +; SI-NEXT: .LBB19_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v8, s12 +; SI-NEXT: v_mov_b32_e32 v9, s13 +; SI-NEXT: v_mov_b32_e32 v10, s14 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v22f16_to_v11f32_scalar: ; VI: ; %bb.0: @@ -4910,66 +4573,76 @@ define <22 x half> @bitcast_v22i16_to_v22f16(<22 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v22i16_to_v22f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v12 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB20_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v51 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v24, v1, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v29, v0, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v22, v1, v37 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v25, v0, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v19, v1, v39 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v23, v0, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v16, v1, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v20, v0, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v11, v1, v51 +; SI-NEXT: v_or_b32_e32 v17, v0, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_alignbit_b32 v26, v24, v33, 16 +; SI-NEXT: v_alignbit_b32 v27, v22, v36, 16 +; SI-NEXT: v_alignbit_b32 v28, v19, v38, 16 +; SI-NEXT: v_alignbit_b32 v30, v16, v48, 16 +; SI-NEXT: v_alignbit_b32 v31, v11, v50, 16 +; SI-NEXT: v_or_b32_e32 v32, v0, v34 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -4981,111 +4654,112 @@ define <22 x half> @bitcast_v22i16_to_v22f16(<22 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: .LBB20_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB20_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v39 ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v37 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v36 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v35 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v8, v50, v8 +; SI-NEXT: v_or_b32_e32 v6, v48, v6 +; SI-NEXT: v_or_b32_e32 v4, v38, v4 +; SI-NEXT: v_or_b32_e32 v2, v36, v2 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v8, v51, v8 +; SI-NEXT: v_or_b32_e32 v6, v49, v6 +; SI-NEXT: v_or_b32_e32 v4, v39, v4 +; SI-NEXT: v_or_b32_e32 v2, v37, v2 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v0 +; SI-NEXT: v_alignbit_b32 v26, v24, v29, 16 +; SI-NEXT: v_alignbit_b32 v27, v22, v25, 16 +; SI-NEXT: v_alignbit_b32 v28, v19, v23, 16 +; SI-NEXT: v_alignbit_b32 v30, v16, v20, 16 +; SI-NEXT: v_alignbit_b32 v31, v11, v17, 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v32 ; SI-NEXT: .LBB20_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v14 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v22 -; SI-NEXT: v_or_b32_e32 v2, v5, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v26 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v18 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v28 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v21 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v30 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v23 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v27 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v28 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v15 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v30 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v14 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v31 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v22i16_to_v22f16: @@ -5204,157 +4878,189 @@ define inreg <22 x half> @bitcast_v22i16_to_v22f16_scalar(<22 x i16> inreg %a, i ; SI-LABEL: bitcast_v22i16_to_v22f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s28, s26, 16 -; SI-NEXT: s_lshr_b32 s15, s25, 16 -; SI-NEXT: s_lshr_b32 s14, s24, 16 -; SI-NEXT: s_lshr_b32 s13, s23, 16 -; SI-NEXT: s_lshr_b32 s12, s22, 16 -; SI-NEXT: s_lshr_b32 s11, s21, 16 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_lshr_b32 s56, s26, 16 +; SI-NEXT: s_lshr_b32 s61, s25, 16 +; SI-NEXT: s_lshr_b32 s74, s24, 16 +; SI-NEXT: s_lshr_b32 s60, s23, 16 +; SI-NEXT: s_lshr_b32 s73, s22, 16 +; SI-NEXT: s_lshr_b32 s59, s21, 16 +; SI-NEXT: s_lshr_b32 s72, s20, 16 +; SI-NEXT: s_lshr_b32 s58, s19, 16 +; SI-NEXT: s_lshr_b32 s63, s18, 16 +; SI-NEXT: s_lshr_b32 s57, s17, 16 +; SI-NEXT: s_lshr_b32 s62, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s27, 0 ; SI-NEXT: s_cbranch_scc0 .LBB21_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s28 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s57, 16 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s14, s62, 16 +; SI-NEXT: s_or_b32 s15, s5, s7 +; SI-NEXT: s_and_b32 s5, s19, 0xffff +; SI-NEXT: s_lshl_b32 s7, s58, 16 +; SI-NEXT: s_or_b32 s12, s4, s14 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s28, s63, 16 +; SI-NEXT: s_or_b32 s29, s5, s7 +; SI-NEXT: s_and_b32 s5, s21, 0xffff +; SI-NEXT: s_lshl_b32 s7, s59, 16 +; SI-NEXT: s_or_b32 s10, s4, s28 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s40, s72, 16 +; SI-NEXT: s_or_b32 s41, s5, s7 +; SI-NEXT: s_and_b32 s5, s23, 0xffff +; SI-NEXT: s_lshl_b32 s7, s60, 16 +; SI-NEXT: s_or_b32 s8, s4, s40 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s42, s73, 16 +; SI-NEXT: s_or_b32 s43, s5, s7 +; SI-NEXT: s_and_b32 s5, s25, 0xffff +; SI-NEXT: s_lshl_b32 s7, s61, 16 +; SI-NEXT: s_or_b32 s6, s4, s42 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s44, s74, 16 +; SI-NEXT: s_or_b32 s45, s5, s7 +; SI-NEXT: s_and_b32 s5, s26, 0xffff +; SI-NEXT: s_lshl_b32 s7, s56, 16 +; SI-NEXT: s_or_b32 s4, s4, s44 +; SI-NEXT: s_or_b32 s27, s5, s7 +; SI-NEXT: s_mov_b32 s13, s15 +; SI-NEXT: s_lshr_b64 s[14:15], s[14:15], 16 +; SI-NEXT: s_mov_b32 s11, s29 +; SI-NEXT: s_lshr_b64 s[28:29], s[28:29], 16 +; SI-NEXT: s_mov_b32 s9, s41 +; SI-NEXT: s_lshr_b64 s[40:41], s[40:41], 16 +; SI-NEXT: s_mov_b32 s7, s43 +; SI-NEXT: s_lshr_b64 s[42:43], s[42:43], 16 +; SI-NEXT: s_mov_b32 s5, s45 +; SI-NEXT: s_lshr_b64 s[44:45], s[44:45], 16 ; SI-NEXT: s_cbranch_execnz .LBB21_3 ; SI-NEXT: .LBB21_2: ; %cmp.true -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s14, s14, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s74, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s25, 0xffff +; SI-NEXT: s_lshl_b32 s6, s61, 16 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s22, 0xffff +; SI-NEXT: s_lshl_b32 s7, s73, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s23, 0xffff +; SI-NEXT: s_lshl_b32 s8, s60, 16 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s72, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s59, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s18, 0xffff +; SI-NEXT: s_lshl_b32 s11, s63, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s19, 0xffff +; SI-NEXT: s_lshl_b32 s12, s58, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s28 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s12, s16, 0xffff +; SI-NEXT: s_lshl_b32 s13, s62, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s17, 0xffff +; SI-NEXT: s_lshl_b32 s14, s57, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_and_b32 s14, s26, 0xffff +; SI-NEXT: s_lshl_b32 s15, s56, 16 +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s27, s14, 0x30000 +; SI-NEXT: s_lshr_b64 s[14:15], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[4:5], 16 +; SI-NEXT: s_lshr_b32 s57, s13, 16 +; SI-NEXT: s_lshr_b32 s58, s11, 16 +; SI-NEXT: s_lshr_b32 s59, s9, 16 +; SI-NEXT: s_lshr_b32 s60, s7, 16 +; SI-NEXT: s_lshr_b32 s61, s5, 16 +; SI-NEXT: s_lshr_b32 s56, s27, 16 ; SI-NEXT: .LBB21_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v0, v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v1, v1, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v17 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v18 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v16 -; SI-NEXT: v_or_b32_e32 v6, v9, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v20 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v19 -; SI-NEXT: v_or_b32_e32 v8, v11, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v21 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_or_b32 s12, s12, s14 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s14, s57, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s14, s28, 16 +; SI-NEXT: s_or_b32 s10, s10, s14 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s14, s58, 16 +; SI-NEXT: s_or_b32 s11, s11, s14 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s14, s40, 16 +; SI-NEXT: s_or_b32 s8, s8, s14 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s14, s59, 16 +; SI-NEXT: s_or_b32 s9, s9, s14 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s14, s42, 16 +; SI-NEXT: s_or_b32 s6, s6, s14 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s14, s60, 16 +; SI-NEXT: s_or_b32 s7, s7, s14 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s14, s44, 16 +; SI-NEXT: s_or_b32 s4, s4, s14 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s14, s61, 16 +; SI-NEXT: s_or_b32 s5, s5, s14 +; SI-NEXT: s_and_b32 s14, s27, 0xffff +; SI-NEXT: s_lshl_b32 s15, s56, 16 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: v_mov_b32_e32 v9, s5 +; SI-NEXT: v_mov_b32_e32 v10, s14 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB21_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr27 ; SI-NEXT: s_branch .LBB21_2 ; ; VI-LABEL: bitcast_v22i16_to_v22f16_scalar: @@ -5536,62 +5242,18 @@ define <22 x i16> @bitcast_v22f16_to_v22i16(<22 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v22f16_to_v22i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] @@ -5599,131 +5261,131 @@ define <22 x i16> @bitcast_v22f16_to_v22i16(<22 x half> %a, i32 %b) { ; SI-NEXT: ; %bb.1: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_or_b32_e32 v10, v10, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_or_b32_e32 v9, v9, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_or_b32_e32 v7, v7, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_or_b32_e32 v5, v5, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v17 ; SI-NEXT: v_or_b32_e32 v3, v3, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 ; SI-NEXT: v_or_b32_e32 v1, v1, v22 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v0, v0, v21 -; SI-NEXT: v_or_b32_e32 v17, v17, v20 -; SI-NEXT: v_or_b32_e32 v14, v14, v19 -; SI-NEXT: v_or_b32_e32 v15, v15, v18 -; SI-NEXT: v_or_b32_e32 v13, v13, v16 +; SI-NEXT: v_or_b32_e32 v2, v2, v20 +; SI-NEXT: v_or_b32_e32 v4, v4, v19 +; SI-NEXT: v_or_b32_e32 v6, v6, v18 +; SI-NEXT: v_or_b32_e32 v8, v8, v15 ; SI-NEXT: v_alignbit_b32 v21, v1, v21, 16 ; SI-NEXT: v_alignbit_b32 v20, v3, v20, 16 ; SI-NEXT: v_alignbit_b32 v19, v5, v19, 16 ; SI-NEXT: v_alignbit_b32 v18, v7, v18, 16 -; SI-NEXT: v_alignbit_b32 v16, v9, v16, 16 +; SI-NEXT: v_alignbit_b32 v15, v9, v15, 16 ; SI-NEXT: .LBB22_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v20 +; SI-NEXT: v_or_b32_e32 v2, v2, v11 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 +; SI-NEXT: v_or_b32_e32 v3, v3, v11 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 +; SI-NEXT: v_or_b32_e32 v4, v4, v11 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v16 +; SI-NEXT: v_or_b32_e32 v5, v5, v11 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v18 +; SI-NEXT: v_or_b32_e32 v6, v6, v11 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v14 +; SI-NEXT: v_or_b32_e32 v7, v7, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v20 -; SI-NEXT: v_or_b32_e32 v4, v4, v14 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v18 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v16 ; SI-NEXT: v_or_b32_e32 v9, v9, v11 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 ; SI-NEXT: v_or_b32_e32 v0, v0, v21 -; SI-NEXT: v_or_b32_e32 v2, v2, v17 -; SI-NEXT: v_or_b32_e32 v6, v6, v14 -; SI-NEXT: v_or_b32_e32 v8, v8, v13 ; SI-NEXT: v_or_b32_e32 v10, v10, v11 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -5844,196 +5506,176 @@ define inreg <22 x i16> @bitcast_v22f16_to_v22i16_scalar(<22 x half> inreg %a, i ; SI-LABEL: bitcast_v22f16_to_v22i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v13 +; SI-NEXT: s_lshr_b32 s12, s26, 16 +; SI-NEXT: s_lshr_b32 s9, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s14, s20, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s15, s18, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s28, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s27, 0 -; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: s_cbranch_scc0 .LBB23_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: s_cbranch_execnz .LBB23_4 ; SI-NEXT: .LBB23_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v23 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v21 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v10 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v20 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_or_b32_e32 v23, v10, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s19 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s13 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v6 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s17 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s23 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v7, v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s21 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s7 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; SI-NEXT: v_or_b32_e32 v5, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s28 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v3 +; SI-NEXT: v_or_b32_e32 v3, v12, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s26 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v24 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_or_b32_e32 v9, v9, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s18 +; SI-NEXT: v_or_b32_e32 v25, v12, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s15 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v31 -; SI-NEXT: v_or_b32_e32 v29, v10, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v26 +; SI-NEXT: v_or_b32_e32 v30, v10, v0 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s24 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v31, v11, v2 -; SI-NEXT: v_or_b32_e32 v30, v10, v4 +; SI-NEXT: v_or_b32_e32 v31, v10, v2 +; SI-NEXT: v_or_b32_e32 v29, v11, v4 ; SI-NEXT: v_or_b32_e32 v28, v12, v6 -; SI-NEXT: v_or_b32_e32 v26, v13, v8 +; SI-NEXT: v_or_b32_e32 v27, v13, v8 ; SI-NEXT: v_lshr_b64 v[18:19], v[0:1], 16 ; SI-NEXT: v_lshr_b64 v[16:17], v[2:3], 16 ; SI-NEXT: v_lshr_b64 v[14:15], v[4:5], 16 ; SI-NEXT: v_lshr_b64 v[12:13], v[6:7], 16 ; SI-NEXT: v_lshr_b64 v[10:11], v[8:9], 16 -; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: s_branch .LBB23_5 +; SI-NEXT: .LBB23_3: +; SI-NEXT: s_branch .LBB23_2 +; SI-NEXT: .LBB23_4: +; SI-NEXT: v_mov_b32_e32 v20, s12 +; SI-NEXT: v_mov_b32_e32 v24, s9 +; SI-NEXT: v_mov_b32_e32 v21, s10 +; SI-NEXT: v_mov_b32_e32 v22, s8 +; SI-NEXT: v_mov_b32_e32 v23, s7 +; SI-NEXT: v_mov_b32_e32 v26, s6 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v25, s26 +; SI-NEXT: v_mov_b32_e32 v27, s24 +; SI-NEXT: v_mov_b32_e32 v28, s22 +; SI-NEXT: v_mov_b32_e32 v29, s20 +; SI-NEXT: v_mov_b32_e32 v31, s18 +; SI-NEXT: v_mov_b32_e32 v30, s16 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v18, s28 +; SI-NEXT: v_mov_b32_e32 v16, s15 +; SI-NEXT: v_mov_b32_e32 v14, s14 +; SI-NEXT: v_mov_b32_e32 v12, s13 +; SI-NEXT: v_mov_b32_e32 v10, s11 +; SI-NEXT: .LBB23_5: ; %end ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v29 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v30 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v31 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v23 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v14 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v30 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v29 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v22 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v28 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v12 ; SI-NEXT: v_or_b32_e32 v6, v6, v8 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v21 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v27 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v8, v8, v10 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v24 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v25 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v20 ; SI-NEXT: v_or_b32_e32 v10, v10, v11 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB23_4: -; SI-NEXT: s_branch .LBB23_2 ; ; VI-LABEL: bitcast_v22f16_to_v22i16_scalar: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll index 50dfbb9a5d234..60c5431f7e4c6 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll @@ -2664,184 +2664,100 @@ define <24 x half> @bitcast_v12i32_to_v24f16(<12 x i32> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: v_alignbit_b32 v12, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v13, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v14, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v15, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v17, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v20, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 ; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_alignbit_b32 v12, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v13, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v14, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v15, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v17, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v20, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 ; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v31 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v27 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v28 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v24 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v23 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v20 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v18 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v17 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v13 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v14 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v23 +; SI-NEXT: v_or_b32_e32 v2, v2, v17 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v22 +; SI-NEXT: v_or_b32_e32 v4, v4, v15 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v21 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_or_b32_e32 v8, v8, v13 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v18 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v20 +; SI-NEXT: v_or_b32_e32 v3, v3, v17 +; SI-NEXT: v_or_b32_e32 v5, v5, v15 +; SI-NEXT: v_or_b32_e32 v7, v7, v14 +; SI-NEXT: v_or_b32_e32 v9, v9, v13 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i32_to_v24f16: @@ -2943,167 +2859,107 @@ define inreg <24 x half> @bitcast_v12i32_to_v24f16_scalar(<12 x i32> inreg %a, i ; SI-NEXT: s_cmp_lg_u32 s28, 0 ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: s_lshr_b32 s40, s27, 16 +; SI-NEXT: s_lshr_b32 s41, s25, 16 +; SI-NEXT: s_lshr_b32 s42, s23, 16 +; SI-NEXT: s_lshr_b32 s43, s21, 16 +; SI-NEXT: s_lshr_b32 s44, s19, 16 +; SI-NEXT: s_lshr_b32 s45, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 ; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_lshr_b64 s[4:5], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s40, s27, 16 +; SI-NEXT: s_lshr_b32 s41, s25, 16 +; SI-NEXT: s_lshr_b32 s42, s23, 16 +; SI-NEXT: s_lshr_b32 s43, s21, 16 +; SI-NEXT: s_lshr_b32 s44, s19, 16 +; SI-NEXT: s_lshr_b32 s45, s17, 16 ; SI-NEXT: .LBB17_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v19 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v17 -; SI-NEXT: v_or_b32_e32 v6, v16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v16 -; SI-NEXT: v_or_b32_e32 v9, v9, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v14 -; SI-NEXT: v_or_b32_e32 v1, v24, v1 -; SI-NEXT: v_or_b32_e32 v3, v22, v3 -; SI-NEXT: v_or_b32_e32 v5, v20, v5 -; SI-NEXT: v_or_b32_e32 v7, v18, v7 -; SI-NEXT: v_or_b32_e32 v8, v15, v8 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s14, 16 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s9, s45, 16 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s18, 0xffff +; SI-NEXT: s_lshl_b32 s11, s12, 16 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_and_b32 s11, s19, 0xffff +; SI-NEXT: s_lshl_b32 s12, s44, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s12, s20, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_or_b32 s10, s12, s10 +; SI-NEXT: s_and_b32 s12, s21, 0xffff +; SI-NEXT: s_lshl_b32 s13, s43, 16 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_or_b32 s8, s13, s8 +; SI-NEXT: s_and_b32 s13, s23, 0xffff +; SI-NEXT: s_lshl_b32 s14, s42, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s14, s24, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s14, s6 +; SI-NEXT: s_and_b32 s14, s25, 0xffff +; SI-NEXT: s_lshl_b32 s15, s41, 16 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_and_b32 s15, s26, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s15, s4 +; SI-NEXT: s_and_b32 s15, s27, 0xffff +; SI-NEXT: s_lshl_b32 s16, s40, 16 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s10 +; SI-NEXT: v_mov_b32_e32 v5, s12 +; SI-NEXT: v_mov_b32_e32 v6, s8 +; SI-NEXT: v_mov_b32_e32 v7, s13 +; SI-NEXT: v_mov_b32_e32 v8, s6 +; SI-NEXT: v_mov_b32_e32 v9, s14 +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: v_mov_b32_e32 v11, s15 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v12i32_to_v24f16_scalar: @@ -3234,66 +3090,30 @@ define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v24f16_to_v12i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v11 +; SI-NEXT: v_mov_b32_e32 v24, v11 +; SI-NEXT: v_mov_b32_e32 v13, v10 +; SI-NEXT: v_mov_b32_e32 v14, v9 +; SI-NEXT: v_mov_b32_e32 v15, v8 +; SI-NEXT: v_mov_b32_e32 v16, v7 +; SI-NEXT: v_mov_b32_e32 v17, v6 +; SI-NEXT: v_mov_b32_e32 v18, v5 +; SI-NEXT: v_mov_b32_e32 v19, v4 +; SI-NEXT: v_mov_b32_e32 v20, v3 +; SI-NEXT: v_mov_b32_e32 v21, v2 +; SI-NEXT: v_mov_b32_e32 v22, v1 +; SI-NEXT: v_mov_b32_e32 v23, v0 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -3306,30 +3126,54 @@ define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB18_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v14 -; SI-NEXT: v_or_b32_e32 v0, v35, v0 -; SI-NEXT: v_or_b32_e32 v1, v33, v1 -; SI-NEXT: v_or_b32_e32 v2, v31, v2 -; SI-NEXT: v_or_b32_e32 v3, v29, v3 -; SI-NEXT: v_or_b32_e32 v4, v27, v4 -; SI-NEXT: v_or_b32_e32 v5, v25, v5 -; SI-NEXT: v_or_b32_e32 v6, v23, v6 -; SI-NEXT: v_or_b32_e32 v7, v21, v7 -; SI-NEXT: v_or_b32_e32 v8, v19, v8 -; SI-NEXT: v_or_b32_e32 v9, v17, v9 -; SI-NEXT: v_or_b32_e32 v10, v15, v10 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v33 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v32 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v30 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v29 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v28 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v27 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v25 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 @@ -3342,25 +3186,13 @@ define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: .LBB18_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -3373,10 +3205,10 @@ define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -3385,25 +3217,25 @@ define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v19 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v18 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v17 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -3411,11 +3243,11 @@ define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v29 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -3423,11 +3255,11 @@ define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v15 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v14 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -3435,12 +3267,12 @@ define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v24 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 @@ -3578,99 +3410,63 @@ define inreg <12 x i32> @bitcast_v24f16_to_v12i32_scalar(<24 x half> inreg %a, i ; SI-LABEL: bitcast_v24f16_to_v12i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s6, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 -; SI-NEXT: s_lshr_b32 s6, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 -; SI-NEXT: s_lshr_b32 s6, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 -; SI-NEXT: s_lshr_b32 s6, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 -; SI-NEXT: s_lshr_b32 s6, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 -; SI-NEXT: s_lshr_b32 s6, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: s_lshr_b32 s6, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 -; SI-NEXT: s_lshr_b32 s6, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 -; SI-NEXT: s_lshr_b32 s6, s17, 16 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: s_lshr_b32 s5, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s6 -; SI-NEXT: s_lshr_b32 s6, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 +; SI-NEXT: s_lshr_b32 s40, s27, 16 +; SI-NEXT: s_lshr_b32 s41, s26, 16 +; SI-NEXT: s_lshr_b32 s42, s25, 16 +; SI-NEXT: s_lshr_b32 s43, s24, 16 +; SI-NEXT: s_lshr_b32 s44, s23, 16 +; SI-NEXT: s_lshr_b32 s45, s22, 16 +; SI-NEXT: s_lshr_b32 s46, s21, 16 +; SI-NEXT: s_lshr_b32 s47, s20, 16 +; SI-NEXT: s_lshr_b32 s56, s19, 16 +; SI-NEXT: s_lshr_b32 s57, s18, 16 +; SI-NEXT: s_lshr_b32 s58, s17, 16 +; SI-NEXT: s_lshr_b32 s59, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s28, 0 -; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: s_cbranch_scc0 .LBB19_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 -; SI-NEXT: v_or_b32_e32 v1, v32, v1 -; SI-NEXT: v_or_b32_e32 v2, v30, v2 -; SI-NEXT: v_or_b32_e32 v3, v28, v3 -; SI-NEXT: v_or_b32_e32 v4, v26, v4 -; SI-NEXT: v_or_b32_e32 v5, v24, v5 -; SI-NEXT: v_or_b32_e32 v6, v22, v6 -; SI-NEXT: v_or_b32_e32 v7, v20, v7 -; SI-NEXT: v_or_b32_e32 v8, v18, v8 -; SI-NEXT: v_or_b32_e32 v9, v15, v9 -; SI-NEXT: v_or_b32_e32 v10, v14, v10 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s58, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s57, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s56, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s47, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s46, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s45, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s44, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s12, s24, 0xffff +; SI-NEXT: s_lshl_b32 s13, s43, 16 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s25, 0xffff +; SI-NEXT: s_lshl_b32 s14, s42, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s14, s26, 0xffff +; SI-NEXT: s_lshl_b32 s15, s41, 16 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_and_b32 s15, s27, 0xffff +; SI-NEXT: s_lshl_b32 s28, s40, 16 +; SI-NEXT: s_or_b32 s15, s15, s28 +; SI-NEXT: s_cbranch_execnz .LBB19_4 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -3679,41 +3475,41 @@ define inreg <12 x i32> @bitcast_v24f16_to_v12i32_scalar(<24 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s57 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s56 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s47 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s46 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s22 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -3721,11 +3517,11 @@ define inreg <12 x i32> @bitcast_v24f16_to_v12i32_scalar(<24 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s44 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s43 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -3733,11 +3529,11 @@ define inreg <12 x i32> @bitcast_v24f16_to_v12i32_scalar(<24 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -3745,29 +3541,42 @@ define inreg <12 x i32> @bitcast_v24f16_to_v12i32_scalar(<24 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s41 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB19_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 +; SI-NEXT: .LBB19_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; SI-NEXT: s_branch .LBB19_2 +; SI-NEXT: .LBB19_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v8, s12 +; SI-NEXT: v_mov_b32_e32 v9, s13 +; SI-NEXT: v_mov_b32_e32 v10, s14 +; SI-NEXT: v_mov_b32_e32 v11, s15 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24f16_to_v12i32_scalar: ; VI: ; %bb.0: @@ -6098,184 +5907,100 @@ define <24 x half> @bitcast_v12f32_to_v24f16(<12 x float> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: v_alignbit_b32 v12, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v13, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v14, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v15, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v17, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v20, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 ; SI-NEXT: .LBB32_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v0 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_alignbit_b32 v12, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v13, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v14, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v15, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v17, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v20, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 ; SI-NEXT: .LBB32_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v31 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v27 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v28 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v24 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v23 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v20 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v18 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v17 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v13 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v14 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v23 +; SI-NEXT: v_or_b32_e32 v2, v2, v17 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v22 +; SI-NEXT: v_or_b32_e32 v4, v4, v15 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v21 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_or_b32_e32 v8, v8, v13 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v18 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v20 +; SI-NEXT: v_or_b32_e32 v3, v3, v17 +; SI-NEXT: v_or_b32_e32 v5, v5, v15 +; SI-NEXT: v_or_b32_e32 v7, v7, v14 +; SI-NEXT: v_or_b32_e32 v9, v9, v13 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f32_to_v24f16: @@ -6360,178 +6085,132 @@ cmp.false: end: %phi = phi <24 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <24 x half> %phi -} - -define inreg <24 x half> @bitcast_v12f32_to_v24f16_scalar(<12 x float> inreg %a, i32 inreg %b) { -; SI-LABEL: bitcast_v12f32_to_v24f16_scalar: -; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s28, 0 -; SI-NEXT: s_cbranch_scc0 .LBB33_4 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 -; SI-NEXT: s_cbranch_execnz .LBB33_3 -; SI-NEXT: .LBB33_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v23, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v22, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v21, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v20, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v19, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v18, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v17, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v14, s24, 1.0 -; SI-NEXT: v_add_f32_e64 v8, s25, 1.0 -; SI-NEXT: v_add_f32_e64 v11, s26, 1.0 -; SI-NEXT: v_add_f32_e64 v10, s27, 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: .LBB33_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v19 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v17 -; SI-NEXT: v_or_b32_e32 v6, v16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v16 -; SI-NEXT: v_or_b32_e32 v9, v9, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v14 -; SI-NEXT: v_or_b32_e32 v1, v24, v1 -; SI-NEXT: v_or_b32_e32 v3, v22, v3 -; SI-NEXT: v_or_b32_e32 v5, v20, v5 -; SI-NEXT: v_or_b32_e32 v7, v18, v7 -; SI-NEXT: v_or_b32_e32 v8, v15, v8 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB33_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr10 + ret <24 x half> %phi +} + +define inreg <24 x half> @bitcast_v12f32_to_v24f16_scalar(<12 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12f32_to_v24f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB33_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s45, s27, 16 +; SI-NEXT: s_lshr_b32 s44, s25, 16 +; SI-NEXT: s_lshr_b32 s43, s23, 16 +; SI-NEXT: s_lshr_b32 s42, s21, 16 +; SI-NEXT: s_lshr_b32 s41, s19, 16 +; SI-NEXT: s_lshr_b32 s40, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 16 +; SI-NEXT: s_cbranch_execnz .LBB33_4 +; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v11, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; SI-NEXT: v_lshr_b64 v[12:13], v[10:11], 16 +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_lshr_b64 v[13:14], v[8:9], 16 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_lshr_b64 v[14:15], v[6:7], 16 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_lshr_b64 v[15:16], v[4:5], 16 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_lshr_b64 v[16:17], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; SI-NEXT: s_branch .LBB33_5 +; SI-NEXT: .LBB33_3: +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr45 ; SI-NEXT: s_branch .LBB33_2 +; SI-NEXT: .LBB33_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v24, s40 +; SI-NEXT: v_mov_b32_e32 v23, s41 +; SI-NEXT: v_mov_b32_e32 v22, s42 +; SI-NEXT: v_mov_b32_e32 v21, s43 +; SI-NEXT: v_mov_b32_e32 v20, s44 +; SI-NEXT: v_mov_b32_e32 v19, s45 +; SI-NEXT: v_mov_b32_e32 v17, s14 +; SI-NEXT: v_mov_b32_e32 v16, s12 +; SI-NEXT: v_mov_b32_e32 v15, s10 +; SI-NEXT: v_mov_b32_e32 v14, s8 +; SI-NEXT: v_mov_b32_e32 v13, s6 +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: .LBB33_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v24 +; SI-NEXT: v_or_b32_e32 v2, v2, v16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v23 +; SI-NEXT: v_or_b32_e32 v4, v4, v15 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v22 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v21 +; SI-NEXT: v_or_b32_e32 v8, v8, v13 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v20 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v17 +; SI-NEXT: v_or_b32_e32 v3, v3, v16 +; SI-NEXT: v_or_b32_e32 v5, v5, v15 +; SI-NEXT: v_or_b32_e32 v7, v7, v14 +; SI-NEXT: v_or_b32_e32 v9, v9, v13 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f32_to_v24f16_scalar: ; VI: ; %bb.0: @@ -6677,66 +6356,30 @@ define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v24f16_to_v12f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v11 +; SI-NEXT: v_mov_b32_e32 v24, v11 +; SI-NEXT: v_mov_b32_e32 v13, v10 +; SI-NEXT: v_mov_b32_e32 v14, v9 +; SI-NEXT: v_mov_b32_e32 v15, v8 +; SI-NEXT: v_mov_b32_e32 v16, v7 +; SI-NEXT: v_mov_b32_e32 v17, v6 +; SI-NEXT: v_mov_b32_e32 v18, v5 +; SI-NEXT: v_mov_b32_e32 v19, v4 +; SI-NEXT: v_mov_b32_e32 v20, v3 +; SI-NEXT: v_mov_b32_e32 v21, v2 +; SI-NEXT: v_mov_b32_e32 v22, v1 +; SI-NEXT: v_mov_b32_e32 v23, v0 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -6749,30 +6392,54 @@ define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB34_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v14 -; SI-NEXT: v_or_b32_e32 v0, v35, v0 -; SI-NEXT: v_or_b32_e32 v1, v33, v1 -; SI-NEXT: v_or_b32_e32 v2, v31, v2 -; SI-NEXT: v_or_b32_e32 v3, v29, v3 -; SI-NEXT: v_or_b32_e32 v4, v27, v4 -; SI-NEXT: v_or_b32_e32 v5, v25, v5 -; SI-NEXT: v_or_b32_e32 v6, v23, v6 -; SI-NEXT: v_or_b32_e32 v7, v21, v7 -; SI-NEXT: v_or_b32_e32 v8, v19, v8 -; SI-NEXT: v_or_b32_e32 v9, v17, v9 -; SI-NEXT: v_or_b32_e32 v10, v15, v10 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v33 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v32 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v30 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v29 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v28 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v27 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v25 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 @@ -6785,25 +6452,13 @@ define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_2 ; SI-NEXT: .LBB34_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -6816,10 +6471,10 @@ define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -6828,25 +6483,25 @@ define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v19 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v18 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v17 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -6854,11 +6509,11 @@ define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v29 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -6866,11 +6521,11 @@ define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v15 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v14 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -6878,12 +6533,12 @@ define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v24 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 @@ -7021,99 +6676,63 @@ define inreg <12 x float> @bitcast_v24f16_to_v12f32_scalar(<24 x half> inreg %a, ; SI-LABEL: bitcast_v24f16_to_v12f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s6, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 -; SI-NEXT: s_lshr_b32 s6, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 -; SI-NEXT: s_lshr_b32 s6, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 -; SI-NEXT: s_lshr_b32 s6, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 -; SI-NEXT: s_lshr_b32 s6, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 -; SI-NEXT: s_lshr_b32 s6, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: s_lshr_b32 s6, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 -; SI-NEXT: s_lshr_b32 s6, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 -; SI-NEXT: s_lshr_b32 s6, s17, 16 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: s_lshr_b32 s5, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s6 -; SI-NEXT: s_lshr_b32 s6, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 +; SI-NEXT: s_lshr_b32 s40, s27, 16 +; SI-NEXT: s_lshr_b32 s41, s26, 16 +; SI-NEXT: s_lshr_b32 s42, s25, 16 +; SI-NEXT: s_lshr_b32 s43, s24, 16 +; SI-NEXT: s_lshr_b32 s44, s23, 16 +; SI-NEXT: s_lshr_b32 s45, s22, 16 +; SI-NEXT: s_lshr_b32 s46, s21, 16 +; SI-NEXT: s_lshr_b32 s47, s20, 16 +; SI-NEXT: s_lshr_b32 s56, s19, 16 +; SI-NEXT: s_lshr_b32 s57, s18, 16 +; SI-NEXT: s_lshr_b32 s58, s17, 16 +; SI-NEXT: s_lshr_b32 s59, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s28, 0 -; SI-NEXT: s_cbranch_scc0 .LBB35_4 +; SI-NEXT: s_cbranch_scc0 .LBB35_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 -; SI-NEXT: v_or_b32_e32 v1, v32, v1 -; SI-NEXT: v_or_b32_e32 v2, v30, v2 -; SI-NEXT: v_or_b32_e32 v3, v28, v3 -; SI-NEXT: v_or_b32_e32 v4, v26, v4 -; SI-NEXT: v_or_b32_e32 v5, v24, v5 -; SI-NEXT: v_or_b32_e32 v6, v22, v6 -; SI-NEXT: v_or_b32_e32 v7, v20, v7 -; SI-NEXT: v_or_b32_e32 v8, v18, v8 -; SI-NEXT: v_or_b32_e32 v9, v15, v9 -; SI-NEXT: v_or_b32_e32 v10, v14, v10 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: s_cbranch_execnz .LBB35_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s58, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s57, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s56, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s47, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s46, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s45, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s44, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s12, s24, 0xffff +; SI-NEXT: s_lshl_b32 s13, s43, 16 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s25, 0xffff +; SI-NEXT: s_lshl_b32 s14, s42, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s14, s26, 0xffff +; SI-NEXT: s_lshl_b32 s15, s41, 16 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_and_b32 s15, s27, 0xffff +; SI-NEXT: s_lshl_b32 s28, s40, 16 +; SI-NEXT: s_or_b32 s15, s15, s28 +; SI-NEXT: s_cbranch_execnz .LBB35_4 ; SI-NEXT: .LBB35_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -7122,41 +6741,41 @@ define inreg <12 x float> @bitcast_v24f16_to_v12f32_scalar(<24 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s57 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s56 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s47 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s46 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s22 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -7164,11 +6783,11 @@ define inreg <12 x float> @bitcast_v24f16_to_v12f32_scalar(<24 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s44 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s43 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -7176,11 +6795,11 @@ define inreg <12 x float> @bitcast_v24f16_to_v12f32_scalar(<24 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -7188,29 +6807,42 @@ define inreg <12 x float> @bitcast_v24f16_to_v12f32_scalar(<24 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s41 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: .LBB35_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB35_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 +; SI-NEXT: .LBB35_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; SI-NEXT: s_branch .LBB35_2 +; SI-NEXT: .LBB35_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v8, s12 +; SI-NEXT: v_mov_b32_e32 v9, s13 +; SI-NEXT: v_mov_b32_e32 v10, s14 +; SI-NEXT: v_mov_b32_e32 v11, s15 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24f16_to_v12f32_scalar: ; VI: ; %bb.0: @@ -8955,76 +8587,34 @@ define <24 x half> @bitcast_v6f64_to_v24f16(<6 x double> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: v_alignbit_b32 v12, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v13, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v14, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v15, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v18, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v20, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 ; SI-NEXT: .LBB44_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_4 @@ -9033,94 +8623,58 @@ define <24 x half> @bitcast_v6f64_to_v24f16(<6 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_alignbit_b32 v12, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v13, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v14, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v15, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v18, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v20, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 ; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v32 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v28 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v26 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v25 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v22 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v21 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v17 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v18 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v13 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v14 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v23 +; SI-NEXT: v_or_b32_e32 v2, v2, v18 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 +; SI-NEXT: v_or_b32_e32 v4, v4, v15 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v21 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_or_b32_e32 v8, v8, v13 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v20 +; SI-NEXT: v_or_b32_e32 v3, v3, v18 +; SI-NEXT: v_or_b32_e32 v5, v5, v15 +; SI-NEXT: v_or_b32_e32 v7, v7, v14 +; SI-NEXT: v_or_b32_e32 v9, v9, v13 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f64_to_v24f16: @@ -9181,185 +8735,139 @@ define <24 x half> @bitcast_v6f64_to_v24f16(<6 x double> %a, i32 %b) { ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 - br i1 %cmp, label %cmp.true, label %cmp.false - -cmp.true: - %a1 = fadd <6 x double> %a, splat (double 1.000000e+00) - %a2 = bitcast <6 x double> %a1 to <24 x half> - br label %end - -cmp.false: - %a3 = bitcast <6 x double> %a to <24 x half> - br label %end - -end: - %phi = phi <24 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <24 x half> %phi -} - -define inreg <24 x half> @bitcast_v6f64_to_v24f16_scalar(<6 x double> inreg %a, i32 inreg %b) { -; SI-LABEL: bitcast_v6f64_to_v24f16_scalar: -; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s28, 0 -; SI-NEXT: s_cbranch_scc0 .LBB45_4 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s16 -; SI-NEXT: s_cbranch_execnz .LBB45_3 -; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[3:4], s[16:17], 1.0 -; SI-NEXT: v_add_f64 v[0:1], s[18:19], 1.0 -; SI-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[5:6], s[22:23], 1.0 -; SI-NEXT: v_add_f64 v[10:11], s[24:25], 1.0 -; SI-NEXT: v_add_f64 v[13:14], s[26:27], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: .LBB45_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_or_b32_e32 v0, v22, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v2, v20, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v19 -; SI-NEXT: v_or_b32_e32 v4, v18, v4 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 -; SI-NEXT: v_or_b32_e32 v5, v5, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v18 -; SI-NEXT: v_or_b32_e32 v7, v7, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v9, v14, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v14 -; SI-NEXT: v_or_b32_e32 v1, v24, v1 -; SI-NEXT: v_or_b32_e32 v3, v22, v3 -; SI-NEXT: v_or_b32_e32 v6, v17, v6 -; SI-NEXT: v_or_b32_e32 v8, v15, v8 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB45_4: -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr10 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <6 x double> %a1 to <24 x half> + br label %end + +cmp.false: + %a3 = bitcast <6 x double> %a to <24 x half> + br label %end + +end: + %phi = phi <24 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x half> %phi +} + +define inreg <24 x half> @bitcast_v6f64_to_v24f16_scalar(<6 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6f64_to_v24f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB45_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s45, s27, 16 +; SI-NEXT: s_lshr_b32 s44, s25, 16 +; SI-NEXT: s_lshr_b32 s43, s23, 16 +; SI-NEXT: s_lshr_b32 s42, s21, 16 +; SI-NEXT: s_lshr_b32 s41, s19, 16 +; SI-NEXT: s_lshr_b32 s40, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 16 +; SI-NEXT: s_cbranch_execnz .LBB45_4 +; SI-NEXT: .LBB45_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_lshr_b64 v[12:13], v[10:11], 16 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_lshr_b64 v[13:14], v[8:9], 16 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_lshr_b64 v[14:15], v[6:7], 16 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_lshr_b64 v[15:16], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[16:17], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; SI-NEXT: s_branch .LBB45_5 +; SI-NEXT: .LBB45_3: +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr45 ; SI-NEXT: s_branch .LBB45_2 +; SI-NEXT: .LBB45_4: +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v19, s45 +; SI-NEXT: v_mov_b32_e32 v20, s44 +; SI-NEXT: v_mov_b32_e32 v21, s43 +; SI-NEXT: v_mov_b32_e32 v22, s42 +; SI-NEXT: v_mov_b32_e32 v23, s41 +; SI-NEXT: v_mov_b32_e32 v24, s40 +; SI-NEXT: v_mov_b32_e32 v17, s14 +; SI-NEXT: v_mov_b32_e32 v16, s12 +; SI-NEXT: v_mov_b32_e32 v15, s10 +; SI-NEXT: v_mov_b32_e32 v14, s8 +; SI-NEXT: v_mov_b32_e32 v13, s6 +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: .LBB45_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v24 +; SI-NEXT: v_or_b32_e32 v2, v2, v16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v23 +; SI-NEXT: v_or_b32_e32 v4, v4, v15 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v22 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v21 +; SI-NEXT: v_or_b32_e32 v8, v8, v13 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v20 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v17 +; SI-NEXT: v_or_b32_e32 v3, v3, v16 +; SI-NEXT: v_or_b32_e32 v5, v5, v15 +; SI-NEXT: v_or_b32_e32 v7, v7, v14 +; SI-NEXT: v_or_b32_e32 v9, v9, v13 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f64_to_v24f16_scalar: ; VI: ; %bb.0: @@ -9487,66 +8995,30 @@ define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v24f16_to_v6f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v11 +; SI-NEXT: v_mov_b32_e32 v16, v11 +; SI-NEXT: v_mov_b32_e32 v17, v10 +; SI-NEXT: v_mov_b32_e32 v18, v9 +; SI-NEXT: v_mov_b32_e32 v19, v8 +; SI-NEXT: v_mov_b32_e32 v20, v7 +; SI-NEXT: v_mov_b32_e32 v21, v6 +; SI-NEXT: v_mov_b32_e32 v22, v5 +; SI-NEXT: v_mov_b32_e32 v23, v4 +; SI-NEXT: v_mov_b32_e32 v24, v3 +; SI-NEXT: v_mov_b32_e32 v25, v2 +; SI-NEXT: v_mov_b32_e32 v26, v1 +; SI-NEXT: v_mov_b32_e32 v27, v0 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v27 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -9559,42 +9031,42 @@ define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB46_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 -; SI-NEXT: v_or_b32_e32 v1, v36, v1 -; SI-NEXT: v_or_b32_e32 v2, v34, v2 -; SI-NEXT: v_or_b32_e32 v3, v32, v3 -; SI-NEXT: v_or_b32_e32 v4, v30, v4 -; SI-NEXT: v_or_b32_e32 v5, v28, v5 -; SI-NEXT: v_or_b32_e32 v6, v26, v6 -; SI-NEXT: v_or_b32_e32 v7, v24, v7 -; SI-NEXT: v_or_b32_e32 v8, v22, v8 -; SI-NEXT: v_or_b32_e32 v9, v20, v9 -; SI-NEXT: v_or_b32_e32 v10, v18, v10 -; SI-NEXT: v_or_b32_e32 v11, v16, v11 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v35 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v34 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v31 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v30 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v29 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v28 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 @@ -9607,13 +9079,25 @@ define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: .LBB46_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -9626,10 +9110,10 @@ define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v24 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -9638,25 +9122,25 @@ define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v23 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v22 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v21 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -9664,11 +9148,11 @@ define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -9676,11 +9160,11 @@ define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v19 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v18 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -9688,11 +9172,11 @@ define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v29 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v16 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 @@ -9831,142 +9315,118 @@ define inreg <6 x double> @bitcast_v24f16_to_v6f64_scalar(<24 x half> inreg %a, ; SI-LABEL: bitcast_v24f16_to_v6f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s6, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 -; SI-NEXT: s_lshr_b32 s6, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 -; SI-NEXT: s_lshr_b32 s6, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 -; SI-NEXT: s_lshr_b32 s6, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 -; SI-NEXT: s_lshr_b32 s6, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 -; SI-NEXT: s_lshr_b32 s6, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: s_lshr_b32 s6, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 -; SI-NEXT: s_lshr_b32 s6, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 -; SI-NEXT: s_lshr_b32 s6, s17, 16 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: s_lshr_b32 s5, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s6 -; SI-NEXT: s_lshr_b32 s6, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v3 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v16, s36, 0 +; SI-NEXT: v_writelane_b32 v16, s37, 1 +; SI-NEXT: v_writelane_b32 v16, s38, 2 +; SI-NEXT: v_writelane_b32 v16, s39, 3 +; SI-NEXT: v_writelane_b32 v16, s48, 4 +; SI-NEXT: v_writelane_b32 v16, s49, 5 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: s_lshr_b32 s11, s22, 16 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s20, 16 +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: s_lshr_b32 s15, s18, 16 +; SI-NEXT: s_lshr_b32 s29, s17, 16 +; SI-NEXT: s_lshr_b32 s56, s16, 16 +; SI-NEXT: v_writelane_b32 v16, s50, 6 ; SI-NEXT: s_cmp_lg_u32 s28, 0 -; SI-NEXT: s_cbranch_scc0 .LBB47_4 +; SI-NEXT: v_writelane_b32 v16, s51, 7 +; SI-NEXT: s_cbranch_scc0 .LBB47_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 -; SI-NEXT: v_or_b32_e32 v1, v36, v1 -; SI-NEXT: v_or_b32_e32 v2, v34, v2 -; SI-NEXT: v_or_b32_e32 v3, v32, v3 -; SI-NEXT: v_or_b32_e32 v4, v30, v4 -; SI-NEXT: v_or_b32_e32 v5, v28, v5 -; SI-NEXT: v_or_b32_e32 v6, v26, v6 -; SI-NEXT: v_or_b32_e32 v7, v24, v7 -; SI-NEXT: v_or_b32_e32 v8, v22, v8 -; SI-NEXT: v_or_b32_e32 v9, v19, v9 -; SI-NEXT: v_or_b32_e32 v10, v18, v10 -; SI-NEXT: v_or_b32_e32 v11, v16, v11 -; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: s_or_b32 s36, s4, s5 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: s_or_b32 s37, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: s_or_b32 s38, s4, s5 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: s_or_b32 s39, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_or_b32 s40, s4, s5 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s12, 16 +; SI-NEXT: s_or_b32 s41, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: s_or_b32 s42, s4, s5 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s10, 16 +; SI-NEXT: s_or_b32 s43, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_or_b32 s44, s4, s5 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s8, 16 +; SI-NEXT: s_or_b32 s45, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s7, 16 +; SI-NEXT: s_or_b32 s46, s4, s5 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s47, s4, s5 +; SI-NEXT: s_cbranch_execnz .LBB47_4 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s15 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s14 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s12 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s22 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -9974,11 +9434,11 @@ define inreg <6 x double> @bitcast_v24f16_to_v6f64_scalar(<24 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s10 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s9 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -9986,11 +9446,11 @@ define inreg <6 x double> @bitcast_v24f16_to_v6f64_scalar(<24 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -9998,12 +9458,12 @@ define inreg <6 x double> @bitcast_v24f16_to_v6f64_scalar(<24 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s7 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 @@ -10016,11 +9476,41 @@ define inreg <6 x double> @bitcast_v24f16_to_v6f64_scalar(<24 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: .LBB47_3: ; %end -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB47_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB47_5 +; SI-NEXT: .LBB47_3: +; SI-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 ; SI-NEXT: s_branch .LBB47_2 +; SI-NEXT: .LBB47_4: +; SI-NEXT: v_mov_b32_e32 v0, s36 +; SI-NEXT: v_mov_b32_e32 v1, s37 +; SI-NEXT: v_mov_b32_e32 v2, s38 +; SI-NEXT: v_mov_b32_e32 v3, s39 +; SI-NEXT: v_mov_b32_e32 v4, s40 +; SI-NEXT: v_mov_b32_e32 v5, s41 +; SI-NEXT: v_mov_b32_e32 v6, s42 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s44 +; SI-NEXT: v_mov_b32_e32 v9, s45 +; SI-NEXT: v_mov_b32_e32 v10, s46 +; SI-NEXT: v_mov_b32_e32 v11, s47 +; SI-NEXT: v_mov_b32_e32 v12, s48 +; SI-NEXT: v_mov_b32_e32 v13, s49 +; SI-NEXT: v_mov_b32_e32 v14, s50 +; SI-NEXT: v_mov_b32_e32 v15, s51 +; SI-NEXT: .LBB47_5: ; %end +; SI-NEXT: v_readlane_b32 s51, v16, 7 +; SI-NEXT: v_readlane_b32 s50, v16, 6 +; SI-NEXT: v_readlane_b32 s49, v16, 5 +; SI-NEXT: v_readlane_b32 s48, v16, 4 +; SI-NEXT: v_readlane_b32 s39, v16, 3 +; SI-NEXT: v_readlane_b32 s38, v16, 2 +; SI-NEXT: v_readlane_b32 s37, v16, 1 +; SI-NEXT: v_readlane_b32 s36, v16, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24f16_to_v6f64_scalar: ; VI: ; %bb.0: @@ -11261,82 +10751,34 @@ define <24 x half> @bitcast_v6i64_to_v24f16(<6 x i64> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: v_alignbit_b32 v12, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v13, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v14, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v15, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v17, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v20, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 ; SI-NEXT: .LBB52_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_4 @@ -11353,92 +10795,56 @@ define <24 x half> @bitcast_v6i64_to_v24f16(<6 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_alignbit_b32 v12, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v13, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v14, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v15, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v17, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v20, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 ; SI-NEXT: .LBB52_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v32 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v28 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v26 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v25 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v22 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v21 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v17 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v18 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v13 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v14 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v23 +; SI-NEXT: v_or_b32_e32 v2, v2, v17 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v22 +; SI-NEXT: v_or_b32_e32 v4, v4, v15 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v21 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_or_b32_e32 v8, v8, v13 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v18 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v20 +; SI-NEXT: v_or_b32_e32 v3, v3, v17 +; SI-NEXT: v_or_b32_e32 v5, v5, v15 +; SI-NEXT: v_or_b32_e32 v7, v7, v14 +; SI-NEXT: v_or_b32_e32 v9, v9, v13 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6i64_to_v24f16: @@ -11543,167 +10949,107 @@ define inreg <24 x half> @bitcast_v6i64_to_v24f16_scalar(<6 x i64> inreg %a, i32 ; SI-NEXT: s_cmp_lg_u32 s28, 0 ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s16 +; SI-NEXT: s_lshr_b32 s40, s27, 16 +; SI-NEXT: s_lshr_b32 s41, s25, 16 +; SI-NEXT: s_lshr_b32 s42, s23, 16 +; SI-NEXT: s_lshr_b32 s43, s21, 16 +; SI-NEXT: s_lshr_b32 s44, s19, 16 +; SI-NEXT: s_lshr_b32 s45, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB53_3 ; SI-NEXT: .LBB53_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s16, 3 -; SI-NEXT: s_addc_u32 s5, s17, 0 -; SI-NEXT: s_lshr_b32 s6, s4, 16 -; SI-NEXT: s_lshr_b32 s7, s5, 16 -; SI-NEXT: s_add_u32 s8, s18, 3 -; SI-NEXT: s_addc_u32 s9, s19, 0 -; SI-NEXT: s_lshr_b32 s10, s8, 16 -; SI-NEXT: s_lshr_b32 s11, s9, 16 -; SI-NEXT: s_add_u32 s12, s20, 3 -; SI-NEXT: s_addc_u32 s13, s21, 0 -; SI-NEXT: s_lshr_b32 s14, s12, 16 -; SI-NEXT: s_lshr_b32 s15, s13, 16 -; SI-NEXT: s_add_u32 s16, s22, 3 -; SI-NEXT: s_addc_u32 s17, s23, 0 -; SI-NEXT: s_lshr_b32 s18, s16, 16 -; SI-NEXT: s_lshr_b32 s19, s17, 16 -; SI-NEXT: s_add_u32 s20, s24, 3 -; SI-NEXT: s_addc_u32 s21, s25, 0 -; SI-NEXT: s_lshr_b32 s22, s20, 16 -; SI-NEXT: s_lshr_b32 s23, s21, 16 -; SI-NEXT: s_add_u32 s24, s26, 3 -; SI-NEXT: s_addc_u32 s25, s27, 0 -; SI-NEXT: s_lshr_b32 s26, s24, 16 -; SI-NEXT: s_lshr_b32 s27, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s6 -; SI-NEXT: .LBB53_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_or_b32_e32 v0, v22, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 -; SI-NEXT: v_or_b32_e32 v2, v20, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 -; SI-NEXT: v_or_b32_e32 v5, v5, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v18 -; SI-NEXT: v_or_b32_e32 v7, v7, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v9, v14, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v14 -; SI-NEXT: v_or_b32_e32 v1, v24, v1 -; SI-NEXT: v_or_b32_e32 v3, v22, v3 -; SI-NEXT: v_or_b32_e32 v4, v19, v4 -; SI-NEXT: v_or_b32_e32 v6, v17, v6 -; SI-NEXT: v_or_b32_e32 v8, v15, v8 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s40, s27, 16 +; SI-NEXT: s_lshr_b32 s41, s25, 16 +; SI-NEXT: s_lshr_b32 s42, s23, 16 +; SI-NEXT: s_lshr_b32 s43, s21, 16 +; SI-NEXT: s_lshr_b32 s44, s19, 16 +; SI-NEXT: s_lshr_b32 s45, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 16 +; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s14, 16 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s9, s45, 16 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s18, 0xffff +; SI-NEXT: s_lshl_b32 s11, s12, 16 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_and_b32 s11, s19, 0xffff +; SI-NEXT: s_lshl_b32 s12, s44, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s12, s20, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_or_b32 s10, s12, s10 +; SI-NEXT: s_and_b32 s12, s21, 0xffff +; SI-NEXT: s_lshl_b32 s13, s43, 16 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_or_b32 s8, s13, s8 +; SI-NEXT: s_and_b32 s13, s23, 0xffff +; SI-NEXT: s_lshl_b32 s14, s42, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s14, s24, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s14, s6 +; SI-NEXT: s_and_b32 s14, s25, 0xffff +; SI-NEXT: s_lshl_b32 s15, s41, 16 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_and_b32 s15, s26, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s15, s4 +; SI-NEXT: s_and_b32 s15, s27, 0xffff +; SI-NEXT: s_lshl_b32 s16, s40, 16 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s10 +; SI-NEXT: v_mov_b32_e32 v5, s12 +; SI-NEXT: v_mov_b32_e32 v6, s8 +; SI-NEXT: v_mov_b32_e32 v7, s13 +; SI-NEXT: v_mov_b32_e32 v8, s6 +; SI-NEXT: v_mov_b32_e32 v9, s14 +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: v_mov_b32_e32 v11, s15 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: s_branch .LBB53_2 ; ; VI-LABEL: bitcast_v6i64_to_v24f16_scalar: @@ -11834,66 +11180,30 @@ define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v24f16_to_v6i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v11 +; SI-NEXT: v_mov_b32_e32 v16, v11 +; SI-NEXT: v_mov_b32_e32 v17, v10 +; SI-NEXT: v_mov_b32_e32 v18, v9 +; SI-NEXT: v_mov_b32_e32 v19, v8 +; SI-NEXT: v_mov_b32_e32 v20, v7 +; SI-NEXT: v_mov_b32_e32 v21, v6 +; SI-NEXT: v_mov_b32_e32 v22, v5 +; SI-NEXT: v_mov_b32_e32 v23, v4 +; SI-NEXT: v_mov_b32_e32 v24, v3 +; SI-NEXT: v_mov_b32_e32 v25, v2 +; SI-NEXT: v_mov_b32_e32 v26, v1 +; SI-NEXT: v_mov_b32_e32 v27, v0 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v27 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -11906,42 +11216,42 @@ define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB54_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 -; SI-NEXT: v_or_b32_e32 v1, v36, v1 -; SI-NEXT: v_or_b32_e32 v2, v34, v2 -; SI-NEXT: v_or_b32_e32 v3, v32, v3 -; SI-NEXT: v_or_b32_e32 v4, v30, v4 -; SI-NEXT: v_or_b32_e32 v5, v28, v5 -; SI-NEXT: v_or_b32_e32 v6, v26, v6 -; SI-NEXT: v_or_b32_e32 v7, v24, v7 -; SI-NEXT: v_or_b32_e32 v8, v22, v8 -; SI-NEXT: v_or_b32_e32 v9, v20, v9 -; SI-NEXT: v_or_b32_e32 v10, v18, v10 -; SI-NEXT: v_or_b32_e32 v11, v16, v11 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v35 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v34 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v31 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v30 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v29 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v28 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 @@ -11954,13 +11264,25 @@ define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_2 ; SI-NEXT: .LBB54_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -11973,10 +11295,10 @@ define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v24 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -11985,25 +11307,25 @@ define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v23 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v22 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v21 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -12011,11 +11333,11 @@ define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -12023,11 +11345,11 @@ define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v19 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v18 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -12035,11 +11357,11 @@ define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v29 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v16 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 @@ -12178,142 +11500,118 @@ define inreg <6 x i64> @bitcast_v24f16_to_v6i64_scalar(<24 x half> inreg %a, i32 ; SI-LABEL: bitcast_v24f16_to_v6i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s6, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 -; SI-NEXT: s_lshr_b32 s6, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 -; SI-NEXT: s_lshr_b32 s6, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 -; SI-NEXT: s_lshr_b32 s6, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 -; SI-NEXT: s_lshr_b32 s6, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 -; SI-NEXT: s_lshr_b32 s6, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: s_lshr_b32 s6, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 -; SI-NEXT: s_lshr_b32 s6, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 -; SI-NEXT: s_lshr_b32 s6, s17, 16 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: s_lshr_b32 s5, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s6 -; SI-NEXT: s_lshr_b32 s6, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v3 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v16, s36, 0 +; SI-NEXT: v_writelane_b32 v16, s37, 1 +; SI-NEXT: v_writelane_b32 v16, s38, 2 +; SI-NEXT: v_writelane_b32 v16, s39, 3 +; SI-NEXT: v_writelane_b32 v16, s48, 4 +; SI-NEXT: v_writelane_b32 v16, s49, 5 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: s_lshr_b32 s11, s22, 16 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s20, 16 +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: s_lshr_b32 s15, s18, 16 +; SI-NEXT: s_lshr_b32 s29, s17, 16 +; SI-NEXT: s_lshr_b32 s56, s16, 16 +; SI-NEXT: v_writelane_b32 v16, s50, 6 ; SI-NEXT: s_cmp_lg_u32 s28, 0 -; SI-NEXT: s_cbranch_scc0 .LBB55_4 +; SI-NEXT: v_writelane_b32 v16, s51, 7 +; SI-NEXT: s_cbranch_scc0 .LBB55_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 -; SI-NEXT: v_or_b32_e32 v1, v36, v1 -; SI-NEXT: v_or_b32_e32 v2, v34, v2 -; SI-NEXT: v_or_b32_e32 v3, v32, v3 -; SI-NEXT: v_or_b32_e32 v4, v30, v4 -; SI-NEXT: v_or_b32_e32 v5, v28, v5 -; SI-NEXT: v_or_b32_e32 v6, v26, v6 -; SI-NEXT: v_or_b32_e32 v7, v24, v7 -; SI-NEXT: v_or_b32_e32 v8, v22, v8 -; SI-NEXT: v_or_b32_e32 v9, v19, v9 -; SI-NEXT: v_or_b32_e32 v10, v18, v10 -; SI-NEXT: v_or_b32_e32 v11, v16, v11 -; SI-NEXT: s_cbranch_execnz .LBB55_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: s_or_b32 s36, s4, s5 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: s_or_b32 s37, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: s_or_b32 s38, s4, s5 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: s_or_b32 s39, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_or_b32 s40, s4, s5 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s12, 16 +; SI-NEXT: s_or_b32 s41, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: s_or_b32 s42, s4, s5 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s10, 16 +; SI-NEXT: s_or_b32 s43, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_or_b32 s44, s4, s5 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s8, 16 +; SI-NEXT: s_or_b32 s45, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s7, 16 +; SI-NEXT: s_or_b32 s46, s4, s5 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s47, s4, s5 +; SI-NEXT: s_cbranch_execnz .LBB55_4 ; SI-NEXT: .LBB55_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s15 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s14 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s12 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s22 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -12321,11 +11619,11 @@ define inreg <6 x i64> @bitcast_v24f16_to_v6i64_scalar(<24 x half> inreg %a, i32 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s10 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s9 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -12333,11 +11631,11 @@ define inreg <6 x i64> @bitcast_v24f16_to_v6i64_scalar(<24 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -12345,12 +11643,12 @@ define inreg <6 x i64> @bitcast_v24f16_to_v6i64_scalar(<24 x half> inreg %a, i32 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s7 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 @@ -12363,11 +11661,41 @@ define inreg <6 x i64> @bitcast_v24f16_to_v6i64_scalar(<24 x half> inreg %a, i32 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: .LBB55_3: ; %end -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB55_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB55_5 +; SI-NEXT: .LBB55_3: +; SI-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 ; SI-NEXT: s_branch .LBB55_2 +; SI-NEXT: .LBB55_4: +; SI-NEXT: v_mov_b32_e32 v0, s36 +; SI-NEXT: v_mov_b32_e32 v1, s37 +; SI-NEXT: v_mov_b32_e32 v2, s38 +; SI-NEXT: v_mov_b32_e32 v3, s39 +; SI-NEXT: v_mov_b32_e32 v4, s40 +; SI-NEXT: v_mov_b32_e32 v5, s41 +; SI-NEXT: v_mov_b32_e32 v6, s42 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s44 +; SI-NEXT: v_mov_b32_e32 v9, s45 +; SI-NEXT: v_mov_b32_e32 v10, s46 +; SI-NEXT: v_mov_b32_e32 v11, s47 +; SI-NEXT: v_mov_b32_e32 v12, s48 +; SI-NEXT: v_mov_b32_e32 v13, s49 +; SI-NEXT: v_mov_b32_e32 v14, s50 +; SI-NEXT: v_mov_b32_e32 v15, s51 +; SI-NEXT: .LBB55_5: ; %end +; SI-NEXT: v_readlane_b32 s51, v16, 7 +; SI-NEXT: v_readlane_b32 s50, v16, 6 +; SI-NEXT: v_readlane_b32 s49, v16, 5 +; SI-NEXT: v_readlane_b32 s48, v16, 4 +; SI-NEXT: v_readlane_b32 s39, v16, 3 +; SI-NEXT: v_readlane_b32 s38, v16, 2 +; SI-NEXT: v_readlane_b32 s37, v16, 1 +; SI-NEXT: v_readlane_b32 s36, v16, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24f16_to_v6i64_scalar: ; VI: ; %bb.0: @@ -12563,71 +11891,83 @@ define <24 x half> @bitcast_v24i16_to_v24f16(<24 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v24i16_to_v24f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v16 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v12, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v27, v1, v38 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v25, v1, v48 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v35, v0, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v22, v1, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v30, v0, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v19, v1, v51 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v26, v0, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v16, v1, v53 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v24, v0, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v12, v1, v55 +; SI-NEXT: v_or_b32_e32 v21, v0, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_alignbit_b32 v28, v27, v36, 16 +; SI-NEXT: v_alignbit_b32 v29, v25, v37, 16 +; SI-NEXT: v_alignbit_b32 v31, v22, v39, 16 +; SI-NEXT: v_alignbit_b32 v32, v19, v50, 16 +; SI-NEXT: v_alignbit_b32 v33, v16, v52, 16 +; SI-NEXT: v_alignbit_b32 v34, v12, v54, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v54 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -12640,120 +11980,121 @@ define <24 x half> @bitcast_v24i16_to_v24f16(<24 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: .LBB56_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v54 ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v52 ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v50 ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v48 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v38 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v36 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v10, v54, v10 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v8, v52, v8 +; SI-NEXT: v_or_b32_e32 v6, v50, v6 +; SI-NEXT: v_or_b32_e32 v4, v39, v4 +; SI-NEXT: v_or_b32_e32 v2, v37, v2 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v35, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v10, v55, v10 +; SI-NEXT: v_or_b32_e32 v8, v53, v8 +; SI-NEXT: v_or_b32_e32 v6, v51, v6 +; SI-NEXT: v_or_b32_e32 v4, v49, v4 +; SI-NEXT: v_or_b32_e32 v2, v48, v2 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v0 +; SI-NEXT: v_alignbit_b32 v28, v27, v35, 16 +; SI-NEXT: v_alignbit_b32 v29, v25, v30, 16 +; SI-NEXT: v_alignbit_b32 v31, v22, v26, 16 +; SI-NEXT: v_alignbit_b32 v32, v19, v24, 16 +; SI-NEXT: v_alignbit_b32 v33, v16, v21, 16 +; SI-NEXT: v_alignbit_b32 v34, v12, v18, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 ; SI-NEXT: .LBB56_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v14 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v26 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v18 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v29 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v21 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v31 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v24 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v33 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v27 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v28 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v20 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v31 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v17 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v15 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v33 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v14 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v34 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24i16_to_v24f16: @@ -12877,170 +12218,206 @@ define inreg <24 x half> @bitcast_v24i16_to_v24f16_scalar(<24 x i16> inreg %a, i ; SI-LABEL: bitcast_v24i16_to_v24f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s40, s27, 16 -; SI-NEXT: s_lshr_b32 s29, s26, 16 -; SI-NEXT: s_lshr_b32 s15, s25, 16 -; SI-NEXT: s_lshr_b32 s14, s24, 16 -; SI-NEXT: s_lshr_b32 s13, s23, 16 -; SI-NEXT: s_lshr_b32 s12, s22, 16 -; SI-NEXT: s_lshr_b32 s11, s21, 16 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_lshr_b32 s73, s27, 16 +; SI-NEXT: s_lshr_b32 s79, s26, 16 +; SI-NEXT: s_lshr_b32 s72, s25, 16 +; SI-NEXT: s_lshr_b32 s78, s24, 16 +; SI-NEXT: s_lshr_b32 s63, s23, 16 +; SI-NEXT: s_lshr_b32 s77, s22, 16 +; SI-NEXT: s_lshr_b32 s62, s21, 16 +; SI-NEXT: s_lshr_b32 s76, s20, 16 +; SI-NEXT: s_lshr_b32 s61, s19, 16 +; SI-NEXT: s_lshr_b32 s75, s18, 16 +; SI-NEXT: s_lshr_b32 s60, s17, 16 +; SI-NEXT: s_lshr_b32 s74, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s28, 0 ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s60, 16 +; SI-NEXT: s_or_b32 s13, s5, s7 +; SI-NEXT: s_and_b32 s5, s19, 0xffff +; SI-NEXT: s_lshl_b32 s7, s61, 16 +; SI-NEXT: s_or_b32 s15, s5, s7 +; SI-NEXT: s_and_b32 s5, s21, 0xffff +; SI-NEXT: s_lshl_b32 s7, s62, 16 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s12, s74, 16 +; SI-NEXT: s_or_b32 s43, s5, s7 +; SI-NEXT: s_and_b32 s5, s23, 0xffff +; SI-NEXT: s_lshl_b32 s7, s63, 16 +; SI-NEXT: s_or_b32 s10, s4, s12 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s14, s75, 16 +; SI-NEXT: s_or_b32 s45, s5, s7 +; SI-NEXT: s_and_b32 s5, s25, 0xffff +; SI-NEXT: s_lshl_b32 s7, s72, 16 +; SI-NEXT: s_or_b32 s8, s4, s14 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s42, s76, 16 +; SI-NEXT: s_lshl_b32 s46, s78, 16 +; SI-NEXT: s_or_b32 s47, s5, s7 +; SI-NEXT: s_and_b32 s5, s27, 0xffff +; SI-NEXT: s_lshl_b32 s7, s73, 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[12:13], 16 +; SI-NEXT: s_and_b32 s12, s24, 0xffff +; SI-NEXT: s_or_b32 s6, s4, s42 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s44, s77, 16 +; SI-NEXT: s_or_b32 s57, s5, s7 +; SI-NEXT: s_lshl_b32 s56, s79, 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[14:15], 16 +; SI-NEXT: s_or_b32 s14, s12, s46 +; SI-NEXT: s_and_b32 s12, s26, 0xffff +; SI-NEXT: s_or_b32 s4, s4, s44 +; SI-NEXT: s_mov_b32 s11, s13 +; SI-NEXT: s_mov_b32 s9, s15 +; SI-NEXT: s_mov_b32 s7, s43 +; SI-NEXT: s_lshr_b64 s[42:43], s[42:43], 16 +; SI-NEXT: s_mov_b32 s5, s45 +; SI-NEXT: s_lshr_b64 s[44:45], s[44:45], 16 +; SI-NEXT: s_mov_b32 s15, s47 +; SI-NEXT: s_lshr_b64 s[46:47], s[46:47], 16 +; SI-NEXT: s_or_b32 s12, s12, s56 +; SI-NEXT: s_mov_b32 s13, s57 +; SI-NEXT: s_lshr_b64 s[56:57], s[56:57], 16 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true -; SI-NEXT: s_add_i32 s40, s40, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s79, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s12, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s73, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s13, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s78, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s14, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s72, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s15, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s77, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s23, 0xffff +; SI-NEXT: s_lshl_b32 s6, s63, 16 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s76, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s21, 0xffff +; SI-NEXT: s_lshl_b32 s8, s62, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s18, 0xffff +; SI-NEXT: s_lshl_b32 s9, s75, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s19, 0xffff +; SI-NEXT: s_lshl_b32 s10, s61, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 -; SI-NEXT: .LBB57_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v0, v0, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 -; SI-NEXT: v_or_b32_e32 v1, v1, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v2, v2, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 -; SI-NEXT: v_or_b32_e32 v3, v3, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v4, v4, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 -; SI-NEXT: v_or_b32_e32 v5, v5, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v21 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB57_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s16, 0xffff +; SI-NEXT: s_lshl_b32 s11, s74, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s60, 16 +; SI-NEXT: s_or_b32 s11, s16, s11 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_lshr_b64 s[28:29], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[12:13], 16 +; SI-NEXT: s_lshr_b32 s60, s11, 16 +; SI-NEXT: s_lshr_b32 s61, s9, 16 +; SI-NEXT: s_lshr_b32 s62, s7, 16 +; SI-NEXT: s_lshr_b32 s63, s5, 16 +; SI-NEXT: s_lshr_b32 s72, s15, 16 +; SI-NEXT: s_lshr_b32 s73, s13, 16 +; SI-NEXT: .LBB57_3: ; %end +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s16, s28, 16 +; SI-NEXT: s_or_b32 s10, s10, s16 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s16, s60, 16 +; SI-NEXT: s_or_b32 s11, s11, s16 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s16, s40, 16 +; SI-NEXT: s_or_b32 s8, s8, s16 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s16, s61, 16 +; SI-NEXT: s_or_b32 s9, s9, s16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s16, s42, 16 +; SI-NEXT: s_or_b32 s6, s6, s16 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s16, s62, 16 +; SI-NEXT: s_or_b32 s7, s7, s16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s16, s44, 16 +; SI-NEXT: s_or_b32 s4, s4, s16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s16, s63, 16 +; SI-NEXT: s_or_b32 s5, s5, s16 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s16, s46, 16 +; SI-NEXT: s_or_b32 s14, s14, s16 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s16, s72, 16 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s16, s56, 16 +; SI-NEXT: s_or_b32 s12, s12, s16 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s16, s73, 16 +; SI-NEXT: s_or_b32 s13, s13, s16 +; SI-NEXT: v_mov_b32_e32 v0, s10 +; SI-NEXT: v_mov_b32_e32 v1, s11 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s6 +; SI-NEXT: v_mov_b32_e32 v5, s7 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: v_mov_b32_e32 v7, s5 +; SI-NEXT: v_mov_b32_e32 v8, s14 +; SI-NEXT: v_mov_b32_e32 v9, s15 +; SI-NEXT: v_mov_b32_e32 v10, s12 +; SI-NEXT: v_mov_b32_e32 v11, s13 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB57_4: +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v24i16_to_v24f16_scalar: @@ -13230,211 +12607,163 @@ define <24 x i16> @bitcast_v24f16_to_v24i16(<24 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v24f16_to_v24i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_or_b32_e32 v11, v11, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_or_b32_e32 v9, v9, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_or_b32_e32 v7, v7, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_or_b32_e32 v5, v5, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v12 ; SI-NEXT: v_or_b32_e32 v3, v3, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v20 ; SI-NEXT: v_or_b32_e32 v1, v1, v24 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v0, v0, v23 -; SI-NEXT: v_or_b32_e32 v18, v18, v22 -; SI-NEXT: v_or_b32_e32 v16, v16, v21 -; SI-NEXT: v_or_b32_e32 v14, v14, v20 -; SI-NEXT: v_or_b32_e32 v15, v15, v19 -; SI-NEXT: v_or_b32_e32 v13, v13, v17 +; SI-NEXT: v_or_b32_e32 v2, v2, v22 +; SI-NEXT: v_or_b32_e32 v4, v4, v21 +; SI-NEXT: v_or_b32_e32 v6, v6, v19 +; SI-NEXT: v_or_b32_e32 v8, v8, v18 +; SI-NEXT: v_or_b32_e32 v10, v10, v15 ; SI-NEXT: v_alignbit_b32 v23, v1, v23, 16 ; SI-NEXT: v_alignbit_b32 v22, v3, v22, 16 ; SI-NEXT: v_alignbit_b32 v21, v5, v21, 16 -; SI-NEXT: v_alignbit_b32 v20, v7, v20, 16 -; SI-NEXT: v_alignbit_b32 v19, v9, v19, 16 -; SI-NEXT: v_alignbit_b32 v17, v11, v17, 16 +; SI-NEXT: v_alignbit_b32 v19, v7, v19, 16 +; SI-NEXT: v_alignbit_b32 v18, v9, v18, 16 +; SI-NEXT: v_alignbit_b32 v15, v11, v15, 16 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v3, v3, v12 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v21 +; SI-NEXT: v_or_b32_e32 v4, v4, v12 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v17 +; SI-NEXT: v_or_b32_e32 v5, v5, v12 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 +; SI-NEXT: v_or_b32_e32 v6, v6, v12 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v16 +; SI-NEXT: v_or_b32_e32 v7, v7, v12 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v18 +; SI-NEXT: v_or_b32_e32 v8, v8, v12 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v9, v9, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v15 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v21 -; SI-NEXT: v_or_b32_e32 v6, v6, v14 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_or_b32_e32 v1, v1, v20 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v22 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 ; SI-NEXT: v_or_b32_e32 v0, v0, v23 -; SI-NEXT: v_or_b32_e32 v2, v2, v18 -; SI-NEXT: v_or_b32_e32 v4, v4, v16 -; SI-NEXT: v_or_b32_e32 v8, v8, v14 -; SI-NEXT: v_or_b32_e32 v10, v10, v13 +; SI-NEXT: v_or_b32_e32 v2, v2, v20 ; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -13560,213 +12889,191 @@ define inreg <24 x i16> @bitcast_v24f16_to_v24i16_scalar(<24 x half> inreg %a, i ; SI-LABEL: bitcast_v24f16_to_v24i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: s_lshr_b32 s5, s23, 16 -; SI-NEXT: s_lshr_b32 s6, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s27, 16 +; SI-NEXT: s_lshr_b32 s8, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s14, s22, 16 +; SI-NEXT: s_lshr_b32 s11, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 ; SI-NEXT: s_lshr_b32 s7, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s20 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: s_lshr_b32 s5, s22, 16 -; SI-NEXT: s_lshr_b32 s6, s20, 16 -; SI-NEXT: s_lshr_b32 s7, s18, 16 -; SI-NEXT: s_lshr_b32 s8, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v6 +; SI-NEXT: s_lshr_b32 s29, s18, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s40, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s28, 0 -; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: s_cbranch_scc0 .LBB59_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: s_cbranch_execnz .LBB59_4 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v10, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s23 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s19 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s13 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s12 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s17 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v25 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v7, v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s21 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s7 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 +; SI-NEXT: v_or_b32_e32 v5, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v6 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v3 +; SI-NEXT: v_or_b32_e32 v3, v11, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s18 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v28 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_or_b32_e32 v9, v9, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v24 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v26 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v35 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v10 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v16 -; SI-NEXT: v_or_b32_e32 v34, v12, v0 -; SI-NEXT: v_or_b32_e32 v32, v13, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v33 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v28 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s20 +; SI-NEXT: v_or_b32_e32 v34, v13, v0 +; SI-NEXT: v_or_b32_e32 v32, v14, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s26 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v35, v13, v4 -; SI-NEXT: v_or_b32_e32 v33, v12, v6 -; SI-NEXT: v_or_b32_e32 v30, v14, v8 -; SI-NEXT: v_or_b32_e32 v28, v15, v10 +; SI-NEXT: v_or_b32_e32 v35, v12, v4 +; SI-NEXT: v_or_b32_e32 v33, v13, v6 +; SI-NEXT: v_or_b32_e32 v31, v14, v8 +; SI-NEXT: v_or_b32_e32 v30, v15, v10 ; SI-NEXT: v_lshr_b64 v[22:23], v[0:1], 16 ; SI-NEXT: v_lshr_b64 v[20:21], v[2:3], 16 ; SI-NEXT: v_lshr_b64 v[18:19], v[4:5], 16 ; SI-NEXT: v_lshr_b64 v[16:17], v[6:7], 16 ; SI-NEXT: v_lshr_b64 v[14:15], v[8:9], 16 ; SI-NEXT: v_lshr_b64 v[12:13], v[10:11], 16 -; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: s_branch .LBB59_5 +; SI-NEXT: .LBB59_3: +; SI-NEXT: s_branch .LBB59_2 +; SI-NEXT: .LBB59_4: +; SI-NEXT: v_mov_b32_e32 v24, s13 +; SI-NEXT: v_mov_b32_e32 v28, s10 +; SI-NEXT: v_mov_b32_e32 v25, s12 +; SI-NEXT: v_mov_b32_e32 v26, s11 +; SI-NEXT: v_mov_b32_e32 v27, s7 +; SI-NEXT: v_mov_b32_e32 v29, s6 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v30, s26 +; SI-NEXT: v_mov_b32_e32 v31, s24 +; SI-NEXT: v_mov_b32_e32 v33, s22 +; SI-NEXT: v_mov_b32_e32 v35, s20 +; SI-NEXT: v_mov_b32_e32 v32, s18 +; SI-NEXT: v_mov_b32_e32 v34, s16 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v22, s40 +; SI-NEXT: v_mov_b32_e32 v20, s29 +; SI-NEXT: v_mov_b32_e32 v18, s15 +; SI-NEXT: v_mov_b32_e32 v16, s14 +; SI-NEXT: v_mov_b32_e32 v14, s9 +; SI-NEXT: v_mov_b32_e32 v12, s8 +; SI-NEXT: .LBB59_5: ; %end ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v34 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v32 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v27 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v35 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v26 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v33 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 ; SI-NEXT: v_or_b32_e32 v6, v6, v8 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v25 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v30 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v31 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v14 ; SI-NEXT: v_or_b32_e32 v8, v8, v10 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v28 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v30 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v10, v10, v12 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 ; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB59_4: -; SI-NEXT: s_branch .LBB59_2 ; ; VI-LABEL: bitcast_v24f16_to_v24i16_scalar: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll index 95359d8ae8f72..8e5490d7eeafc 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll @@ -2978,212 +2978,114 @@ define <28 x half> @bitcast_v14i32_to_v28f16(<14 x i32> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: v_alignbit_b32 v14, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v15, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v16, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v17, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v18, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v21, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v23, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_alignbit_b32 v14, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v15, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v16, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v17, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v18, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v21, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v23, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v38 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v34 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v32 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v31 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v28 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v27 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v23 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v24 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v19 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v20 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v15 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v16 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v27 +; SI-NEXT: v_or_b32_e32 v2, v2, v21 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 +; SI-NEXT: v_or_b32_e32 v4, v4, v18 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 +; SI-NEXT: v_or_b32_e32 v6, v6, v17 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v24 +; SI-NEXT: v_or_b32_e32 v8, v8, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v22 +; SI-NEXT: v_or_b32_e32 v10, v10, v15 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v20 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v23 +; SI-NEXT: v_or_b32_e32 v3, v3, v21 +; SI-NEXT: v_or_b32_e32 v5, v5, v18 +; SI-NEXT: v_or_b32_e32 v7, v7, v17 +; SI-NEXT: v_or_b32_e32 v9, v9, v16 +; SI-NEXT: v_or_b32_e32 v11, v11, v15 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14i32_to_v28f16: @@ -3292,196 +3194,126 @@ define inreg <28 x half> @bitcast_v14i32_to_v28f16_scalar(<14 x i32> inreg %a, i ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s16 +; SI-NEXT: s_lshr_b32 s44, s29, 16 +; SI-NEXT: s_lshr_b32 s45, s27, 16 +; SI-NEXT: s_lshr_b32 s46, s25, 16 +; SI-NEXT: s_lshr_b32 s47, s23, 16 +; SI-NEXT: s_lshr_b32 s56, s21, 16 +; SI-NEXT: s_lshr_b32 s57, s19, 16 +; SI-NEXT: s_lshr_b32 s58, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: s_lshr_b32 s5, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s20, 16 -; SI-NEXT: s_lshr_b32 s9, s21, 16 -; SI-NEXT: s_lshr_b32 s10, s22, 16 -; SI-NEXT: s_lshr_b32 s11, s23, 16 -; SI-NEXT: s_lshr_b32 s12, s24, 16 -; SI-NEXT: s_lshr_b32 s13, s25, 16 -; SI-NEXT: s_lshr_b32 s14, s26, 16 -; SI-NEXT: s_lshr_b32 s15, s27, 16 -; SI-NEXT: s_lshr_b32 s40, s28, 16 -; SI-NEXT: s_lshr_b32 s41, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_lshr_b64 s[4:5], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s44, s29, 16 +; SI-NEXT: s_lshr_b32 s45, s27, 16 +; SI-NEXT: s_lshr_b32 s46, s25, 16 +; SI-NEXT: s_lshr_b32 s47, s23, 16 +; SI-NEXT: s_lshr_b32 s56, s21, 16 +; SI-NEXT: s_lshr_b32 s57, s19, 16 +; SI-NEXT: s_lshr_b32 s58, s17, 16 ; SI-NEXT: .LBB17_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 -; SI-NEXT: v_or_b32_e32 v0, v26, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; SI-NEXT: v_or_b32_e32 v2, v24, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v24 -; SI-NEXT: v_or_b32_e32 v5, v5, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v22 -; SI-NEXT: v_or_b32_e32 v7, v7, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v20 -; SI-NEXT: v_or_b32_e32 v9, v18, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v11, v16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v16 -; SI-NEXT: v_or_b32_e32 v1, v28, v1 -; SI-NEXT: v_or_b32_e32 v3, v26, v3 -; SI-NEXT: v_or_b32_e32 v4, v23, v4 -; SI-NEXT: v_or_b32_e32 v6, v21, v6 -; SI-NEXT: v_or_b32_e32 v8, v19, v8 -; SI-NEXT: v_or_b32_e32 v10, v17, v10 -; SI-NEXT: v_or_b32_e32 v12, v15, v12 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB17_4: -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: s_branch .LBB17_2 -; -; VI-LABEL: bitcast_v14i32_to_v28f16_scalar: +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s40, 16 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s9, s58, 16 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s18, 0xffff +; SI-NEXT: s_lshl_b32 s11, s14, 16 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_and_b32 s11, s19, 0xffff +; SI-NEXT: s_lshl_b32 s13, s57, 16 +; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: s_and_b32 s13, s20, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s56, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s14, s22, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_or_b32 s10, s14, s10 +; SI-NEXT: s_and_b32 s14, s23, 0xffff +; SI-NEXT: s_lshl_b32 s15, s47, 16 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_and_b32 s15, s24, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_or_b32 s8, s15, s8 +; SI-NEXT: s_and_b32 s15, s25, 0xffff +; SI-NEXT: s_lshl_b32 s16, s46, 16 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: s_and_b32 s16, s26, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s16, s6 +; SI-NEXT: s_and_b32 s16, s27, 0xffff +; SI-NEXT: s_lshl_b32 s17, s45, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s28, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s17, s4 +; SI-NEXT: s_and_b32 s17, s29, 0xffff +; SI-NEXT: s_lshl_b32 s18, s44, 16 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s12 +; SI-NEXT: v_mov_b32_e32 v5, s13 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v7, s14 +; SI-NEXT: v_mov_b32_e32 v8, s8 +; SI-NEXT: v_mov_b32_e32 v9, s15 +; SI-NEXT: v_mov_b32_e32 v10, s6 +; SI-NEXT: v_mov_b32_e32 v11, s16 +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: v_mov_b32_e32 v13, s17 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_v14i32_to_v28f16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 @@ -3622,76 +3454,34 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v28f16_to_v14i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v13 +; SI-NEXT: v_mov_b32_e32 v16, v13 +; SI-NEXT: v_mov_b32_e32 v17, v12 +; SI-NEXT: v_mov_b32_e32 v18, v11 +; SI-NEXT: v_mov_b32_e32 v19, v10 +; SI-NEXT: v_mov_b32_e32 v20, v9 +; SI-NEXT: v_mov_b32_e32 v21, v8 +; SI-NEXT: v_mov_b32_e32 v22, v7 +; SI-NEXT: v_mov_b32_e32 v23, v6 +; SI-NEXT: v_mov_b32_e32 v24, v5 +; SI-NEXT: v_mov_b32_e32 v25, v4 +; SI-NEXT: v_mov_b32_e32 v26, v3 +; SI-NEXT: v_mov_b32_e32 v27, v2 +; SI-NEXT: v_mov_b32_e32 v28, v1 +; SI-NEXT: v_mov_b32_e32 v29, v0 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -3704,48 +3494,48 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB18_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_or_b32_e32 v2, v38, v2 -; SI-NEXT: v_or_b32_e32 v3, v36, v3 -; SI-NEXT: v_or_b32_e32 v4, v34, v4 -; SI-NEXT: v_or_b32_e32 v5, v32, v5 -; SI-NEXT: v_or_b32_e32 v6, v30, v6 -; SI-NEXT: v_or_b32_e32 v7, v28, v7 -; SI-NEXT: v_or_b32_e32 v8, v26, v8 -; SI-NEXT: v_or_b32_e32 v9, v24, v9 -; SI-NEXT: v_or_b32_e32 v10, v22, v10 -; SI-NEXT: v_or_b32_e32 v11, v20, v11 -; SI-NEXT: v_or_b32_e32 v12, v18, v12 -; SI-NEXT: v_or_b32_e32 v13, v16, v13 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v39 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v37 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v36 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v35 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v33 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v32 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v31 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr27 @@ -3760,13 +3550,27 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: .LBB18_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v28 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -3774,10 +3578,10 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -3785,11 +3589,11 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v39 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -3797,11 +3601,11 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -3809,11 +3613,11 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v37 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -3821,11 +3625,11 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v22 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v21 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -3833,11 +3637,11 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -3845,11 +3649,11 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v19 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 @@ -3857,11 +3661,11 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v31 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v16 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 @@ -4010,114 +3814,84 @@ define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i ; SI-LABEL: bitcast_v28f16_to_v14i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 -; SI-NEXT: s_lshr_b32 s8, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 -; SI-NEXT: s_lshr_b32 s8, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 -; SI-NEXT: s_lshr_b32 s8, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 -; SI-NEXT: s_lshr_b32 s8, s21, 16 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 -; SI-NEXT: s_lshr_b32 s8, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s8 -; SI-NEXT: s_lshr_b32 s8, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 -; SI-NEXT: s_lshr_b32 s8, s17, 16 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 -; SI-NEXT: s_lshr_b32 s8, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v16, s36, 0 +; SI-NEXT: v_writelane_b32 v16, s37, 1 +; SI-NEXT: v_writelane_b32 v16, s38, 2 +; SI-NEXT: v_writelane_b32 v16, s39, 3 +; SI-NEXT: v_writelane_b32 v16, s48, 4 +; SI-NEXT: v_writelane_b32 v16, s49, 5 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s56, s19, 16 +; SI-NEXT: s_lshr_b32 s57, s18, 16 +; SI-NEXT: s_lshr_b32 s58, s17, 16 +; SI-NEXT: s_lshr_b32 s59, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_writelane_b32 v16, s50, 6 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: v_writelane_b32 v16, s51, 7 +; SI-NEXT: s_cbranch_scc0 .LBB19_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_or_b32_e32 v2, v38, v2 -; SI-NEXT: v_or_b32_e32 v3, v36, v3 -; SI-NEXT: v_or_b32_e32 v4, v34, v4 -; SI-NEXT: v_or_b32_e32 v5, v32, v5 -; SI-NEXT: v_or_b32_e32 v6, v30, v6 -; SI-NEXT: v_or_b32_e32 v7, v28, v7 -; SI-NEXT: v_or_b32_e32 v8, v26, v8 -; SI-NEXT: v_or_b32_e32 v9, v23, v9 -; SI-NEXT: v_or_b32_e32 v10, v22, v10 -; SI-NEXT: v_or_b32_e32 v11, v19, v11 -; SI-NEXT: v_or_b32_e32 v12, v18, v12 -; SI-NEXT: v_or_b32_e32 v13, v16, v13 -; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: s_or_b32 s36, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: s_or_b32 s38, s4, s5 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: s_and_b32 s40, s17, 0xffff +; SI-NEXT: s_lshl_b32 s41, s58, 16 +; SI-NEXT: s_or_b32 s39, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: s_or_b32 s37, s40, s41 +; SI-NEXT: s_or_b32 s40, s4, s5 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: s_or_b32 s41, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_or_b32 s42, s4, s5 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s12, 16 +; SI-NEXT: s_or_b32 s43, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: s_or_b32 s44, s4, s5 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s10, 16 +; SI-NEXT: s_or_b32 s45, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_or_b32 s46, s4, s5 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s8, 16 +; SI-NEXT: s_or_b32 s47, s4, s5 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s7, 16 +; SI-NEXT: s_or_b32 s48, s4, s5 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s49, s4, s5 +; SI-NEXT: s_cbranch_execnz .LBB19_4 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -4125,10 +3899,10 @@ define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s57 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -4136,11 +3910,11 @@ define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s56 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s15 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -4148,11 +3922,11 @@ define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -4160,11 +3934,11 @@ define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -4172,11 +3946,11 @@ define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -4184,11 +3958,11 @@ define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -4196,11 +3970,11 @@ define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s26 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 @@ -4208,12 +3982,12 @@ define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s29 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 @@ -4226,11 +4000,41 @@ define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: .LBB19_3: ; %end -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB19_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB19_5 +; SI-NEXT: .LBB19_3: +; SI-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 ; SI-NEXT: s_branch .LBB19_2 +; SI-NEXT: .LBB19_4: +; SI-NEXT: v_mov_b32_e32 v0, s36 +; SI-NEXT: v_mov_b32_e32 v1, s37 +; SI-NEXT: v_mov_b32_e32 v2, s38 +; SI-NEXT: v_mov_b32_e32 v3, s39 +; SI-NEXT: v_mov_b32_e32 v4, s40 +; SI-NEXT: v_mov_b32_e32 v5, s41 +; SI-NEXT: v_mov_b32_e32 v6, s42 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s44 +; SI-NEXT: v_mov_b32_e32 v9, s45 +; SI-NEXT: v_mov_b32_e32 v10, s46 +; SI-NEXT: v_mov_b32_e32 v11, s47 +; SI-NEXT: v_mov_b32_e32 v12, s48 +; SI-NEXT: v_mov_b32_e32 v13, s49 +; SI-NEXT: v_mov_b32_e32 v14, s50 +; SI-NEXT: v_mov_b32_e32 v15, s51 +; SI-NEXT: .LBB19_5: ; %end +; SI-NEXT: v_readlane_b32 s51, v16, 7 +; SI-NEXT: v_readlane_b32 s50, v16, 6 +; SI-NEXT: v_readlane_b32 s49, v16, 5 +; SI-NEXT: v_readlane_b32 s48, v16, 4 +; SI-NEXT: v_readlane_b32 s39, v16, 3 +; SI-NEXT: v_readlane_b32 s38, v16, 2 +; SI-NEXT: v_readlane_b32 s37, v16, 1 +; SI-NEXT: v_readlane_b32 s36, v16, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28f16_to_v14i32_scalar: ; VI: ; %bb.0: @@ -6825,212 +6629,114 @@ define <28 x half> @bitcast_v14f32_to_v28f16(<14 x float> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: v_alignbit_b32 v14, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v15, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v16, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v17, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v18, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v21, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v23, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; SI-NEXT: .LBB32_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_alignbit_b32 v14, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v15, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v16, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v17, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v18, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v21, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v23, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; SI-NEXT: .LBB32_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v38 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v34 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v32 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v31 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v28 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v27 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v23 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v24 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v19 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v20 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v15 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v16 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v27 +; SI-NEXT: v_or_b32_e32 v2, v2, v21 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 +; SI-NEXT: v_or_b32_e32 v4, v4, v18 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 +; SI-NEXT: v_or_b32_e32 v6, v6, v17 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v24 +; SI-NEXT: v_or_b32_e32 v8, v8, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v22 +; SI-NEXT: v_or_b32_e32 v10, v10, v15 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v20 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v23 +; SI-NEXT: v_or_b32_e32 v3, v3, v21 +; SI-NEXT: v_or_b32_e32 v5, v5, v18 +; SI-NEXT: v_or_b32_e32 v7, v7, v17 +; SI-NEXT: v_or_b32_e32 v9, v9, v16 +; SI-NEXT: v_or_b32_e32 v11, v11, v15 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14f32_to_v28f16: @@ -7130,196 +6836,142 @@ define inreg <28 x half> @bitcast_v14f32_to_v28f16_scalar(<14 x float> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cbranch_scc0 .LBB33_4 +; SI-NEXT: s_cbranch_scc0 .LBB33_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s16 -; SI-NEXT: s_cbranch_execnz .LBB33_3 +; SI-NEXT: s_lshr_b32 s58, s29, 16 +; SI-NEXT: s_lshr_b32 s57, s27, 16 +; SI-NEXT: s_lshr_b32 s56, s25, 16 +; SI-NEXT: s_lshr_b32 s47, s23, 16 +; SI-NEXT: s_lshr_b32 s46, s21, 16 +; SI-NEXT: s_lshr_b32 s45, s19, 16 +; SI-NEXT: s_lshr_b32 s44, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[16:17], 16 +; SI-NEXT: s_cbranch_execnz .LBB33_4 ; SI-NEXT: .LBB33_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v0, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v3, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v2, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v13, s29, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s28, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; SI-NEXT: v_lshr_b64 v[14:15], v[12:13], 16 +; SI-NEXT: v_add_f32_e64 v9, s25, 1.0 ; SI-NEXT: v_add_f32_e64 v8, s24, 1.0 -; SI-NEXT: v_add_f32_e64 v10, s25, 1.0 -; SI-NEXT: v_add_f32_e64 v11, s26, 1.0 -; SI-NEXT: v_add_f32_e64 v12, s27, 1.0 -; SI-NEXT: v_add_f32_e64 v13, s28, 1.0 -; SI-NEXT: v_add_f32_e64 v14, s29, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: .LBB33_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 -; SI-NEXT: v_or_b32_e32 v0, v26, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; SI-NEXT: v_or_b32_e32 v2, v24, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v24 -; SI-NEXT: v_or_b32_e32 v5, v5, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v22 -; SI-NEXT: v_or_b32_e32 v7, v7, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v20 -; SI-NEXT: v_or_b32_e32 v9, v18, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v11, v16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v16 -; SI-NEXT: v_or_b32_e32 v1, v28, v1 -; SI-NEXT: v_or_b32_e32 v3, v26, v3 -; SI-NEXT: v_or_b32_e32 v4, v23, v4 -; SI-NEXT: v_or_b32_e32 v6, v21, v6 -; SI-NEXT: v_or_b32_e32 v8, v19, v8 -; SI-NEXT: v_or_b32_e32 v10, v17, v10 -; SI-NEXT: v_or_b32_e32 v12, v15, v12 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB33_4: -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: v_lshr_b64 v[15:16], v[10:11], 16 +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_lshr_b64 v[16:17], v[8:9], 16 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_lshr_b64 v[17:18], v[6:7], 16 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_lshr_b64 v[18:19], v[4:5], 16 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_lshr_b64 v[19:20], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[20:21], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; SI-NEXT: s_branch .LBB33_5 +; SI-NEXT: .LBB33_3: +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: s_branch .LBB33_2 +; SI-NEXT: .LBB33_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: v_mov_b32_e32 v28, s44 +; SI-NEXT: v_mov_b32_e32 v27, s45 +; SI-NEXT: v_mov_b32_e32 v26, s46 +; SI-NEXT: v_mov_b32_e32 v25, s47 +; SI-NEXT: v_mov_b32_e32 v24, s56 +; SI-NEXT: v_mov_b32_e32 v23, s57 +; SI-NEXT: v_mov_b32_e32 v22, s58 +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: v_mov_b32_e32 v15, s6 +; SI-NEXT: v_mov_b32_e32 v16, s8 +; SI-NEXT: v_mov_b32_e32 v17, s10 +; SI-NEXT: v_mov_b32_e32 v18, s12 +; SI-NEXT: v_mov_b32_e32 v19, s14 +; SI-NEXT: v_mov_b32_e32 v20, s40 +; SI-NEXT: .LBB33_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; SI-NEXT: v_or_b32_e32 v2, v2, v19 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v27 +; SI-NEXT: v_or_b32_e32 v4, v4, v18 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 +; SI-NEXT: v_or_b32_e32 v6, v6, v17 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v25 +; SI-NEXT: v_or_b32_e32 v8, v8, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v24 +; SI-NEXT: v_or_b32_e32 v10, v10, v15 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v23 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v20 +; SI-NEXT: v_or_b32_e32 v3, v3, v19 +; SI-NEXT: v_or_b32_e32 v5, v5, v18 +; SI-NEXT: v_or_b32_e32 v7, v7, v17 +; SI-NEXT: v_or_b32_e32 v9, v9, v16 +; SI-NEXT: v_or_b32_e32 v11, v11, v15 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14f32_to_v28f16_scalar: ; VI: ; %bb.0: @@ -7473,76 +7125,34 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v28f16_to_v14f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v13 +; SI-NEXT: v_mov_b32_e32 v16, v13 +; SI-NEXT: v_mov_b32_e32 v17, v12 +; SI-NEXT: v_mov_b32_e32 v18, v11 +; SI-NEXT: v_mov_b32_e32 v19, v10 +; SI-NEXT: v_mov_b32_e32 v20, v9 +; SI-NEXT: v_mov_b32_e32 v21, v8 +; SI-NEXT: v_mov_b32_e32 v22, v7 +; SI-NEXT: v_mov_b32_e32 v23, v6 +; SI-NEXT: v_mov_b32_e32 v24, v5 +; SI-NEXT: v_mov_b32_e32 v25, v4 +; SI-NEXT: v_mov_b32_e32 v26, v3 +; SI-NEXT: v_mov_b32_e32 v27, v2 +; SI-NEXT: v_mov_b32_e32 v28, v1 +; SI-NEXT: v_mov_b32_e32 v29, v0 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -7555,48 +7165,48 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB34_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_or_b32_e32 v2, v38, v2 -; SI-NEXT: v_or_b32_e32 v3, v36, v3 -; SI-NEXT: v_or_b32_e32 v4, v34, v4 -; SI-NEXT: v_or_b32_e32 v5, v32, v5 -; SI-NEXT: v_or_b32_e32 v6, v30, v6 -; SI-NEXT: v_or_b32_e32 v7, v28, v7 -; SI-NEXT: v_or_b32_e32 v8, v26, v8 -; SI-NEXT: v_or_b32_e32 v9, v24, v9 -; SI-NEXT: v_or_b32_e32 v10, v22, v10 -; SI-NEXT: v_or_b32_e32 v11, v20, v11 -; SI-NEXT: v_or_b32_e32 v12, v18, v12 -; SI-NEXT: v_or_b32_e32 v13, v16, v13 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v39 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v37 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v36 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v35 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v33 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v32 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v31 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr27 @@ -7611,13 +7221,27 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_2 ; SI-NEXT: .LBB34_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v28 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -7625,10 +7249,10 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -7636,11 +7260,11 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v39 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -7648,11 +7272,11 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -7660,11 +7284,11 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v37 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -7672,11 +7296,11 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v22 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v21 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -7684,11 +7308,11 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -7696,11 +7320,11 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v19 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 @@ -7708,11 +7332,11 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v31 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v16 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 @@ -7861,114 +7485,84 @@ define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, ; SI-LABEL: bitcast_v28f16_to_v14f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 -; SI-NEXT: s_lshr_b32 s8, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 -; SI-NEXT: s_lshr_b32 s8, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 -; SI-NEXT: s_lshr_b32 s8, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 -; SI-NEXT: s_lshr_b32 s8, s21, 16 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 -; SI-NEXT: s_lshr_b32 s8, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s8 -; SI-NEXT: s_lshr_b32 s8, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 -; SI-NEXT: s_lshr_b32 s8, s17, 16 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 -; SI-NEXT: s_lshr_b32 s8, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v16, s36, 0 +; SI-NEXT: v_writelane_b32 v16, s37, 1 +; SI-NEXT: v_writelane_b32 v16, s38, 2 +; SI-NEXT: v_writelane_b32 v16, s39, 3 +; SI-NEXT: v_writelane_b32 v16, s48, 4 +; SI-NEXT: v_writelane_b32 v16, s49, 5 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s56, s19, 16 +; SI-NEXT: s_lshr_b32 s57, s18, 16 +; SI-NEXT: s_lshr_b32 s58, s17, 16 +; SI-NEXT: s_lshr_b32 s59, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_writelane_b32 v16, s50, 6 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cbranch_scc0 .LBB35_4 +; SI-NEXT: v_writelane_b32 v16, s51, 7 +; SI-NEXT: s_cbranch_scc0 .LBB35_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_or_b32_e32 v2, v38, v2 -; SI-NEXT: v_or_b32_e32 v3, v36, v3 -; SI-NEXT: v_or_b32_e32 v4, v34, v4 -; SI-NEXT: v_or_b32_e32 v5, v32, v5 -; SI-NEXT: v_or_b32_e32 v6, v30, v6 -; SI-NEXT: v_or_b32_e32 v7, v28, v7 -; SI-NEXT: v_or_b32_e32 v8, v26, v8 -; SI-NEXT: v_or_b32_e32 v9, v23, v9 -; SI-NEXT: v_or_b32_e32 v10, v22, v10 -; SI-NEXT: v_or_b32_e32 v11, v19, v11 -; SI-NEXT: v_or_b32_e32 v12, v18, v12 -; SI-NEXT: v_or_b32_e32 v13, v16, v13 -; SI-NEXT: s_cbranch_execnz .LBB35_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: s_or_b32 s36, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: s_or_b32 s38, s4, s5 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: s_and_b32 s40, s17, 0xffff +; SI-NEXT: s_lshl_b32 s41, s58, 16 +; SI-NEXT: s_or_b32 s39, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: s_or_b32 s37, s40, s41 +; SI-NEXT: s_or_b32 s40, s4, s5 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: s_or_b32 s41, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_or_b32 s42, s4, s5 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s12, 16 +; SI-NEXT: s_or_b32 s43, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: s_or_b32 s44, s4, s5 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s10, 16 +; SI-NEXT: s_or_b32 s45, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_or_b32 s46, s4, s5 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s8, 16 +; SI-NEXT: s_or_b32 s47, s4, s5 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s7, 16 +; SI-NEXT: s_or_b32 s48, s4, s5 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s49, s4, s5 +; SI-NEXT: s_cbranch_execnz .LBB35_4 ; SI-NEXT: .LBB35_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -7976,10 +7570,10 @@ define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s57 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -7987,11 +7581,11 @@ define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s56 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s15 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -7999,11 +7593,11 @@ define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -8011,11 +7605,11 @@ define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -8023,11 +7617,11 @@ define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -8035,11 +7629,11 @@ define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -8047,11 +7641,11 @@ define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s26 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 @@ -8059,12 +7653,12 @@ define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s29 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 @@ -8077,11 +7671,41 @@ define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: .LBB35_3: ; %end -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB35_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB35_5 +; SI-NEXT: .LBB35_3: +; SI-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 ; SI-NEXT: s_branch .LBB35_2 +; SI-NEXT: .LBB35_4: +; SI-NEXT: v_mov_b32_e32 v0, s36 +; SI-NEXT: v_mov_b32_e32 v1, s37 +; SI-NEXT: v_mov_b32_e32 v2, s38 +; SI-NEXT: v_mov_b32_e32 v3, s39 +; SI-NEXT: v_mov_b32_e32 v4, s40 +; SI-NEXT: v_mov_b32_e32 v5, s41 +; SI-NEXT: v_mov_b32_e32 v6, s42 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s44 +; SI-NEXT: v_mov_b32_e32 v9, s45 +; SI-NEXT: v_mov_b32_e32 v10, s46 +; SI-NEXT: v_mov_b32_e32 v11, s47 +; SI-NEXT: v_mov_b32_e32 v12, s48 +; SI-NEXT: v_mov_b32_e32 v13, s49 +; SI-NEXT: v_mov_b32_e32 v14, s50 +; SI-NEXT: v_mov_b32_e32 v15, s51 +; SI-NEXT: .LBB35_5: ; %end +; SI-NEXT: v_readlane_b32 s51, v16, 7 +; SI-NEXT: v_readlane_b32 s50, v16, 6 +; SI-NEXT: v_readlane_b32 s49, v16, 5 +; SI-NEXT: v_readlane_b32 s48, v16, 4 +; SI-NEXT: v_readlane_b32 s39, v16, 3 +; SI-NEXT: v_readlane_b32 s38, v16, 2 +; SI-NEXT: v_readlane_b32 s37, v16, 1 +; SI-NEXT: v_readlane_b32 s36, v16, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28f16_to_v14f32_scalar: ; VI: ; %bb.0: @@ -10033,94 +9657,38 @@ define <28 x half> @bitcast_v7i64_to_v28f16(<7 x i64> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: v_alignbit_b32 v14, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v15, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v16, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v17, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v18, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v20, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v23, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; SI-NEXT: .LBB44_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_4 @@ -10139,106 +9707,64 @@ define <28 x half> @bitcast_v7i64_to_v28f16(<7 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; SI-NEXT: v_alignbit_b32 v14, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v15, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v16, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v17, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v18, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v20, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v23, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v38 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v34 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v32 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v31 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v28 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v27 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v23 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v24 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v19 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v20 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v15 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v16 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v27 +; SI-NEXT: v_or_b32_e32 v2, v2, v20 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v26 +; SI-NEXT: v_or_b32_e32 v4, v4, v18 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 +; SI-NEXT: v_or_b32_e32 v6, v6, v17 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v24 +; SI-NEXT: v_or_b32_e32 v8, v8, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v22 +; SI-NEXT: v_or_b32_e32 v10, v10, v15 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v21 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v23 +; SI-NEXT: v_or_b32_e32 v3, v3, v20 +; SI-NEXT: v_or_b32_e32 v5, v5, v18 +; SI-NEXT: v_or_b32_e32 v7, v7, v17 +; SI-NEXT: v_or_b32_e32 v9, v9, v16 +; SI-NEXT: v_or_b32_e32 v11, v11, v15 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7i64_to_v28f16: @@ -10351,193 +9877,123 @@ define inreg <28 x half> @bitcast_v7i64_to_v28f16_scalar(<7 x i64> inreg %a, i32 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB45_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s16 +; SI-NEXT: s_lshr_b32 s44, s29, 16 +; SI-NEXT: s_lshr_b32 s45, s27, 16 +; SI-NEXT: s_lshr_b32 s46, s25, 16 +; SI-NEXT: s_lshr_b32 s47, s23, 16 +; SI-NEXT: s_lshr_b32 s56, s21, 16 +; SI-NEXT: s_lshr_b32 s57, s19, 16 +; SI-NEXT: s_lshr_b32 s58, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s16, 3 -; SI-NEXT: s_addc_u32 s5, s17, 0 -; SI-NEXT: s_lshr_b32 s6, s4, 16 -; SI-NEXT: s_lshr_b32 s7, s5, 16 -; SI-NEXT: s_add_u32 s8, s18, 3 -; SI-NEXT: s_addc_u32 s9, s19, 0 -; SI-NEXT: s_lshr_b32 s10, s8, 16 -; SI-NEXT: s_lshr_b32 s11, s9, 16 -; SI-NEXT: s_add_u32 s12, s20, 3 -; SI-NEXT: s_addc_u32 s13, s21, 0 -; SI-NEXT: s_lshr_b32 s14, s12, 16 -; SI-NEXT: s_lshr_b32 s15, s13, 16 -; SI-NEXT: s_add_u32 s16, s22, 3 -; SI-NEXT: s_addc_u32 s17, s23, 0 -; SI-NEXT: s_lshr_b32 s18, s16, 16 -; SI-NEXT: s_lshr_b32 s19, s17, 16 -; SI-NEXT: s_add_u32 s20, s24, 3 -; SI-NEXT: s_addc_u32 s21, s25, 0 -; SI-NEXT: s_lshr_b32 s22, s20, 16 -; SI-NEXT: s_lshr_b32 s23, s21, 16 -; SI-NEXT: s_add_u32 s24, s26, 3 -; SI-NEXT: s_addc_u32 s25, s27, 0 -; SI-NEXT: s_lshr_b32 s26, s24, 16 -; SI-NEXT: s_lshr_b32 s27, s25, 16 ; SI-NEXT: s_add_u32 s28, s28, 3 ; SI-NEXT: s_addc_u32 s29, s29, 0 -; SI-NEXT: s_lshr_b32 s40, s28, 16 -; SI-NEXT: s_lshr_b32 s41, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s6 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s44, s29, 16 +; SI-NEXT: s_lshr_b32 s45, s27, 16 +; SI-NEXT: s_lshr_b32 s46, s25, 16 +; SI-NEXT: s_lshr_b32 s47, s23, 16 +; SI-NEXT: s_lshr_b32 s56, s21, 16 +; SI-NEXT: s_lshr_b32 s57, s19, 16 +; SI-NEXT: s_lshr_b32 s58, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[16:17], 16 ; SI-NEXT: .LBB45_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 -; SI-NEXT: v_or_b32_e32 v0, v26, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; SI-NEXT: v_or_b32_e32 v2, v24, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v24 -; SI-NEXT: v_or_b32_e32 v5, v5, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v22 -; SI-NEXT: v_or_b32_e32 v7, v7, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v20 -; SI-NEXT: v_or_b32_e32 v9, v18, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v11, v16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v16 -; SI-NEXT: v_or_b32_e32 v1, v28, v1 -; SI-NEXT: v_or_b32_e32 v3, v26, v3 -; SI-NEXT: v_or_b32_e32 v4, v23, v4 -; SI-NEXT: v_or_b32_e32 v6, v21, v6 -; SI-NEXT: v_or_b32_e32 v8, v19, v8 -; SI-NEXT: v_or_b32_e32 v10, v17, v10 -; SI-NEXT: v_or_b32_e32 v12, v15, v12 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s40, 16 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s9, s58, 16 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s18, 0xffff +; SI-NEXT: s_lshl_b32 s11, s14, 16 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_and_b32 s11, s19, 0xffff +; SI-NEXT: s_lshl_b32 s13, s57, 16 +; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: s_and_b32 s13, s20, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s56, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s14, s22, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_or_b32 s10, s14, s10 +; SI-NEXT: s_and_b32 s14, s23, 0xffff +; SI-NEXT: s_lshl_b32 s15, s47, 16 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_and_b32 s15, s24, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_or_b32 s8, s15, s8 +; SI-NEXT: s_and_b32 s15, s25, 0xffff +; SI-NEXT: s_lshl_b32 s16, s46, 16 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: s_and_b32 s16, s26, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s16, s6 +; SI-NEXT: s_and_b32 s16, s27, 0xffff +; SI-NEXT: s_lshl_b32 s17, s45, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s28, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s17, s4 +; SI-NEXT: s_and_b32 s17, s29, 0xffff +; SI-NEXT: s_lshl_b32 s18, s44, 16 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s12 +; SI-NEXT: v_mov_b32_e32 v5, s13 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v7, s14 +; SI-NEXT: v_mov_b32_e32 v8, s8 +; SI-NEXT: v_mov_b32_e32 v9, s15 +; SI-NEXT: v_mov_b32_e32 v10, s6 +; SI-NEXT: v_mov_b32_e32 v11, s16 +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: v_mov_b32_e32 v13, s17 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr44 ; SI-NEXT: s_branch .LBB45_2 ; ; VI-LABEL: bitcast_v7i64_to_v28f16_scalar: @@ -10681,76 +10137,34 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v28f16_to_v7i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v13 +; SI-NEXT: v_mov_b32_e32 v16, v13 +; SI-NEXT: v_mov_b32_e32 v17, v12 +; SI-NEXT: v_mov_b32_e32 v18, v11 +; SI-NEXT: v_mov_b32_e32 v19, v10 +; SI-NEXT: v_mov_b32_e32 v20, v9 +; SI-NEXT: v_mov_b32_e32 v21, v8 +; SI-NEXT: v_mov_b32_e32 v22, v7 +; SI-NEXT: v_mov_b32_e32 v23, v6 +; SI-NEXT: v_mov_b32_e32 v24, v5 +; SI-NEXT: v_mov_b32_e32 v25, v4 +; SI-NEXT: v_mov_b32_e32 v26, v3 +; SI-NEXT: v_mov_b32_e32 v27, v2 +; SI-NEXT: v_mov_b32_e32 v28, v1 +; SI-NEXT: v_mov_b32_e32 v29, v0 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -10763,48 +10177,48 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB46_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_or_b32_e32 v2, v38, v2 -; SI-NEXT: v_or_b32_e32 v3, v36, v3 -; SI-NEXT: v_or_b32_e32 v4, v34, v4 -; SI-NEXT: v_or_b32_e32 v5, v32, v5 -; SI-NEXT: v_or_b32_e32 v6, v30, v6 -; SI-NEXT: v_or_b32_e32 v7, v28, v7 -; SI-NEXT: v_or_b32_e32 v8, v26, v8 -; SI-NEXT: v_or_b32_e32 v9, v24, v9 -; SI-NEXT: v_or_b32_e32 v10, v22, v10 -; SI-NEXT: v_or_b32_e32 v11, v20, v11 -; SI-NEXT: v_or_b32_e32 v12, v18, v12 -; SI-NEXT: v_or_b32_e32 v13, v16, v13 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v39 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v37 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v36 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v35 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v33 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v32 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v31 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr27 @@ -10819,13 +10233,27 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: .LBB46_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v28 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -10833,10 +10261,10 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -10844,11 +10272,11 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v39 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -10856,11 +10284,11 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -10868,11 +10296,11 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v37 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -10880,11 +10308,11 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v22 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v21 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -10892,11 +10320,11 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -10904,11 +10332,11 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v19 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 @@ -10916,11 +10344,11 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v31 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v16 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 @@ -11069,114 +10497,84 @@ define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 ; SI-LABEL: bitcast_v28f16_to_v7i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 -; SI-NEXT: s_lshr_b32 s8, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 -; SI-NEXT: s_lshr_b32 s8, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 -; SI-NEXT: s_lshr_b32 s8, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 -; SI-NEXT: s_lshr_b32 s8, s21, 16 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 -; SI-NEXT: s_lshr_b32 s8, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s8 -; SI-NEXT: s_lshr_b32 s8, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 -; SI-NEXT: s_lshr_b32 s8, s17, 16 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 -; SI-NEXT: s_lshr_b32 s8, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v16, s36, 0 +; SI-NEXT: v_writelane_b32 v16, s37, 1 +; SI-NEXT: v_writelane_b32 v16, s38, 2 +; SI-NEXT: v_writelane_b32 v16, s39, 3 +; SI-NEXT: v_writelane_b32 v16, s48, 4 +; SI-NEXT: v_writelane_b32 v16, s49, 5 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s56, s19, 16 +; SI-NEXT: s_lshr_b32 s57, s18, 16 +; SI-NEXT: s_lshr_b32 s58, s17, 16 +; SI-NEXT: s_lshr_b32 s59, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_writelane_b32 v16, s50, 6 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cbranch_scc0 .LBB47_4 +; SI-NEXT: v_writelane_b32 v16, s51, 7 +; SI-NEXT: s_cbranch_scc0 .LBB47_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_or_b32_e32 v2, v38, v2 -; SI-NEXT: v_or_b32_e32 v3, v36, v3 -; SI-NEXT: v_or_b32_e32 v4, v34, v4 -; SI-NEXT: v_or_b32_e32 v5, v32, v5 -; SI-NEXT: v_or_b32_e32 v6, v30, v6 -; SI-NEXT: v_or_b32_e32 v7, v28, v7 -; SI-NEXT: v_or_b32_e32 v8, v26, v8 -; SI-NEXT: v_or_b32_e32 v9, v23, v9 -; SI-NEXT: v_or_b32_e32 v10, v22, v10 -; SI-NEXT: v_or_b32_e32 v11, v19, v11 -; SI-NEXT: v_or_b32_e32 v12, v18, v12 -; SI-NEXT: v_or_b32_e32 v13, v16, v13 -; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: s_or_b32 s36, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: s_or_b32 s38, s4, s5 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: s_and_b32 s40, s17, 0xffff +; SI-NEXT: s_lshl_b32 s41, s58, 16 +; SI-NEXT: s_or_b32 s39, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: s_or_b32 s37, s40, s41 +; SI-NEXT: s_or_b32 s40, s4, s5 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: s_or_b32 s41, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_or_b32 s42, s4, s5 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s12, 16 +; SI-NEXT: s_or_b32 s43, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: s_or_b32 s44, s4, s5 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s10, 16 +; SI-NEXT: s_or_b32 s45, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_or_b32 s46, s4, s5 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s8, 16 +; SI-NEXT: s_or_b32 s47, s4, s5 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s7, 16 +; SI-NEXT: s_or_b32 s48, s4, s5 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s49, s4, s5 +; SI-NEXT: s_cbranch_execnz .LBB47_4 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -11184,10 +10582,10 @@ define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s57 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -11195,11 +10593,11 @@ define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s56 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s15 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -11207,11 +10605,11 @@ define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -11219,11 +10617,11 @@ define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -11231,11 +10629,11 @@ define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -11243,11 +10641,11 @@ define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -11255,11 +10653,11 @@ define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s26 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 @@ -11267,12 +10665,12 @@ define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s29 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 @@ -11285,11 +10683,41 @@ define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: .LBB47_3: ; %end -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB47_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB47_5 +; SI-NEXT: .LBB47_3: +; SI-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 ; SI-NEXT: s_branch .LBB47_2 +; SI-NEXT: .LBB47_4: +; SI-NEXT: v_mov_b32_e32 v0, s36 +; SI-NEXT: v_mov_b32_e32 v1, s37 +; SI-NEXT: v_mov_b32_e32 v2, s38 +; SI-NEXT: v_mov_b32_e32 v3, s39 +; SI-NEXT: v_mov_b32_e32 v4, s40 +; SI-NEXT: v_mov_b32_e32 v5, s41 +; SI-NEXT: v_mov_b32_e32 v6, s42 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s44 +; SI-NEXT: v_mov_b32_e32 v9, s45 +; SI-NEXT: v_mov_b32_e32 v10, s46 +; SI-NEXT: v_mov_b32_e32 v11, s47 +; SI-NEXT: v_mov_b32_e32 v12, s48 +; SI-NEXT: v_mov_b32_e32 v13, s49 +; SI-NEXT: v_mov_b32_e32 v14, s50 +; SI-NEXT: v_mov_b32_e32 v15, s51 +; SI-NEXT: .LBB47_5: ; %end +; SI-NEXT: v_readlane_b32 s51, v16, 7 +; SI-NEXT: v_readlane_b32 s50, v16, 6 +; SI-NEXT: v_readlane_b32 s49, v16, 5 +; SI-NEXT: v_readlane_b32 s48, v16, 4 +; SI-NEXT: v_readlane_b32 s39, v16, 3 +; SI-NEXT: v_readlane_b32 s38, v16, 2 +; SI-NEXT: v_readlane_b32 s37, v16, 1 +; SI-NEXT: v_readlane_b32 s36, v16, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28f16_to_v7i64_scalar: ; VI: ; %bb.0: @@ -12635,87 +12063,38 @@ define <28 x half> @bitcast_v7f64_to_v28f16(<7 x double> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: v_alignbit_b32 v14, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v15, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v16, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v17, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v19, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v21, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v24, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; SI-NEXT: .LBB52_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_4 @@ -12725,108 +12104,66 @@ define <28 x half> @bitcast_v7f64_to_v28f16(<7 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_alignbit_b32 v14, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v15, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v16, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v17, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v19, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v21, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v24, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; SI-NEXT: .LBB52_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v38 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v34 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v32 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v31 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v28 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v27 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v23 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v24 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v19 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v20 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v15 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v16 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v27 +; SI-NEXT: v_or_b32_e32 v2, v2, v21 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 +; SI-NEXT: v_or_b32_e32 v4, v4, v19 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 +; SI-NEXT: v_or_b32_e32 v6, v6, v17 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v23 +; SI-NEXT: v_or_b32_e32 v8, v8, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v22 +; SI-NEXT: v_or_b32_e32 v10, v10, v15 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v20 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v18 +; SI-NEXT: v_or_b32_e32 v1, v1, v24 +; SI-NEXT: v_or_b32_e32 v3, v3, v21 +; SI-NEXT: v_or_b32_e32 v5, v5, v19 +; SI-NEXT: v_or_b32_e32 v7, v7, v17 +; SI-NEXT: v_or_b32_e32 v9, v9, v16 +; SI-NEXT: v_or_b32_e32 v11, v11, v15 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7f64_to_v28f16: @@ -12912,189 +12249,135 @@ define inreg <28 x half> @bitcast_v7f64_to_v28f16_scalar(<7 x double> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cbranch_scc0 .LBB53_4 +; SI-NEXT: s_cbranch_scc0 .LBB53_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s16 -; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: s_lshr_b32 s58, s29, 16 +; SI-NEXT: s_lshr_b32 s57, s27, 16 +; SI-NEXT: s_lshr_b32 s56, s25, 16 +; SI-NEXT: s_lshr_b32 s47, s23, 16 +; SI-NEXT: s_lshr_b32 s46, s21, 16 +; SI-NEXT: s_lshr_b32 s45, s19, 16 +; SI-NEXT: s_lshr_b32 s44, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[16:17], 16 +; SI-NEXT: s_cbranch_execnz .LBB53_4 ; SI-NEXT: .LBB53_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[3:4], s[16:17], 1.0 -; SI-NEXT: v_add_f64 v[0:1], s[18:19], 1.0 -; SI-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[5:6], s[22:23], 1.0 -; SI-NEXT: v_add_f64 v[10:11], s[24:25], 1.0 -; SI-NEXT: v_add_f64 v[12:13], s[26:27], 1.0 -; SI-NEXT: v_add_f64 v[15:16], s[28:29], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: .LBB53_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 -; SI-NEXT: v_or_b32_e32 v0, v26, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v2, v24, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v23 -; SI-NEXT: v_or_b32_e32 v4, v22, v4 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v24 -; SI-NEXT: v_or_b32_e32 v5, v5, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v22 -; SI-NEXT: v_or_b32_e32 v7, v7, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v20 -; SI-NEXT: v_or_b32_e32 v9, v18, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v11, v16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v16 -; SI-NEXT: v_or_b32_e32 v1, v28, v1 -; SI-NEXT: v_or_b32_e32 v3, v26, v3 -; SI-NEXT: v_or_b32_e32 v6, v21, v6 -; SI-NEXT: v_or_b32_e32 v8, v19, v8 -; SI-NEXT: v_or_b32_e32 v10, v17, v10 -; SI-NEXT: v_or_b32_e32 v12, v15, v12 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB53_4: -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: v_add_f64 v[12:13], s[28:29], 1.0 +; SI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; SI-NEXT: v_lshr_b64 v[14:15], v[12:13], 16 +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_lshr_b64 v[15:16], v[10:11], 16 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_lshr_b64 v[16:17], v[8:9], 16 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_lshr_b64 v[17:18], v[6:7], 16 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_lshr_b64 v[18:19], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[19:20], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[20:21], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; SI-NEXT: s_branch .LBB53_5 +; SI-NEXT: .LBB53_3: +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: s_branch .LBB53_2 +; SI-NEXT: .LBB53_4: +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v22, s58 +; SI-NEXT: v_mov_b32_e32 v23, s57 +; SI-NEXT: v_mov_b32_e32 v24, s56 +; SI-NEXT: v_mov_b32_e32 v25, s47 +; SI-NEXT: v_mov_b32_e32 v26, s46 +; SI-NEXT: v_mov_b32_e32 v27, s45 +; SI-NEXT: v_mov_b32_e32 v28, s44 +; SI-NEXT: v_mov_b32_e32 v20, s40 +; SI-NEXT: v_mov_b32_e32 v19, s14 +; SI-NEXT: v_mov_b32_e32 v18, s12 +; SI-NEXT: v_mov_b32_e32 v17, s10 +; SI-NEXT: v_mov_b32_e32 v16, s8 +; SI-NEXT: v_mov_b32_e32 v15, s6 +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: .LBB53_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; SI-NEXT: v_or_b32_e32 v2, v2, v19 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v27 +; SI-NEXT: v_or_b32_e32 v4, v4, v18 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 +; SI-NEXT: v_or_b32_e32 v6, v6, v17 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v25 +; SI-NEXT: v_or_b32_e32 v8, v8, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v24 +; SI-NEXT: v_or_b32_e32 v10, v10, v15 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v23 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v20 +; SI-NEXT: v_or_b32_e32 v3, v3, v19 +; SI-NEXT: v_or_b32_e32 v5, v5, v18 +; SI-NEXT: v_or_b32_e32 v7, v7, v17 +; SI-NEXT: v_or_b32_e32 v9, v9, v16 +; SI-NEXT: v_or_b32_e32 v11, v11, v15 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7f64_to_v28f16_scalar: ; VI: ; %bb.0: @@ -13227,76 +12510,34 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v28f16_to_v7f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v13 +; SI-NEXT: v_mov_b32_e32 v16, v13 +; SI-NEXT: v_mov_b32_e32 v17, v12 +; SI-NEXT: v_mov_b32_e32 v18, v11 +; SI-NEXT: v_mov_b32_e32 v19, v10 +; SI-NEXT: v_mov_b32_e32 v20, v9 +; SI-NEXT: v_mov_b32_e32 v21, v8 +; SI-NEXT: v_mov_b32_e32 v22, v7 +; SI-NEXT: v_mov_b32_e32 v23, v6 +; SI-NEXT: v_mov_b32_e32 v24, v5 +; SI-NEXT: v_mov_b32_e32 v25, v4 +; SI-NEXT: v_mov_b32_e32 v26, v3 +; SI-NEXT: v_mov_b32_e32 v27, v2 +; SI-NEXT: v_mov_b32_e32 v28, v1 +; SI-NEXT: v_mov_b32_e32 v29, v0 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -13309,48 +12550,48 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB54_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_or_b32_e32 v2, v38, v2 -; SI-NEXT: v_or_b32_e32 v3, v36, v3 -; SI-NEXT: v_or_b32_e32 v4, v34, v4 -; SI-NEXT: v_or_b32_e32 v5, v32, v5 -; SI-NEXT: v_or_b32_e32 v6, v30, v6 -; SI-NEXT: v_or_b32_e32 v7, v28, v7 -; SI-NEXT: v_or_b32_e32 v8, v26, v8 -; SI-NEXT: v_or_b32_e32 v9, v24, v9 -; SI-NEXT: v_or_b32_e32 v10, v22, v10 -; SI-NEXT: v_or_b32_e32 v11, v20, v11 -; SI-NEXT: v_or_b32_e32 v12, v18, v12 -; SI-NEXT: v_or_b32_e32 v13, v16, v13 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v39 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v37 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v36 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v35 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v33 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v32 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v31 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr27 @@ -13365,13 +12606,27 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_2 ; SI-NEXT: .LBB54_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v28 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -13379,10 +12634,10 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -13390,11 +12645,11 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v39 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -13402,11 +12657,11 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -13414,11 +12669,11 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v37 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -13426,11 +12681,11 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v22 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v21 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -13438,11 +12693,11 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -13450,11 +12705,11 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v19 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 @@ -13462,11 +12717,11 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v31 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v16 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 @@ -13615,114 +12870,84 @@ define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, ; SI-LABEL: bitcast_v28f16_to_v7f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 -; SI-NEXT: s_lshr_b32 s8, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 -; SI-NEXT: s_lshr_b32 s8, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 -; SI-NEXT: s_lshr_b32 s8, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 -; SI-NEXT: s_lshr_b32 s8, s21, 16 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 -; SI-NEXT: s_lshr_b32 s8, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s8 -; SI-NEXT: s_lshr_b32 s8, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 -; SI-NEXT: s_lshr_b32 s8, s17, 16 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 -; SI-NEXT: s_lshr_b32 s8, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v16, s36, 0 +; SI-NEXT: v_writelane_b32 v16, s37, 1 +; SI-NEXT: v_writelane_b32 v16, s38, 2 +; SI-NEXT: v_writelane_b32 v16, s39, 3 +; SI-NEXT: v_writelane_b32 v16, s48, 4 +; SI-NEXT: v_writelane_b32 v16, s49, 5 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s56, s19, 16 +; SI-NEXT: s_lshr_b32 s57, s18, 16 +; SI-NEXT: s_lshr_b32 s58, s17, 16 +; SI-NEXT: s_lshr_b32 s59, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_writelane_b32 v16, s50, 6 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cbranch_scc0 .LBB55_4 +; SI-NEXT: v_writelane_b32 v16, s51, 7 +; SI-NEXT: s_cbranch_scc0 .LBB55_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_or_b32_e32 v2, v38, v2 -; SI-NEXT: v_or_b32_e32 v3, v36, v3 -; SI-NEXT: v_or_b32_e32 v4, v34, v4 -; SI-NEXT: v_or_b32_e32 v5, v32, v5 -; SI-NEXT: v_or_b32_e32 v6, v30, v6 -; SI-NEXT: v_or_b32_e32 v7, v28, v7 -; SI-NEXT: v_or_b32_e32 v8, v26, v8 -; SI-NEXT: v_or_b32_e32 v9, v23, v9 -; SI-NEXT: v_or_b32_e32 v10, v22, v10 -; SI-NEXT: v_or_b32_e32 v11, v19, v11 -; SI-NEXT: v_or_b32_e32 v12, v18, v12 -; SI-NEXT: v_or_b32_e32 v13, v16, v13 -; SI-NEXT: s_cbranch_execnz .LBB55_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: s_or_b32 s36, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: s_or_b32 s38, s4, s5 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: s_and_b32 s40, s17, 0xffff +; SI-NEXT: s_lshl_b32 s41, s58, 16 +; SI-NEXT: s_or_b32 s39, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: s_or_b32 s37, s40, s41 +; SI-NEXT: s_or_b32 s40, s4, s5 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: s_or_b32 s41, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_or_b32 s42, s4, s5 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s12, 16 +; SI-NEXT: s_or_b32 s43, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: s_or_b32 s44, s4, s5 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s10, 16 +; SI-NEXT: s_or_b32 s45, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_or_b32 s46, s4, s5 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s8, 16 +; SI-NEXT: s_or_b32 s47, s4, s5 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s7, 16 +; SI-NEXT: s_or_b32 s48, s4, s5 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s49, s4, s5 +; SI-NEXT: s_cbranch_execnz .LBB55_4 ; SI-NEXT: .LBB55_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -13730,10 +12955,10 @@ define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s57 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -13741,11 +12966,11 @@ define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s56 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s15 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -13753,11 +12978,11 @@ define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -13765,11 +12990,11 @@ define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -13777,11 +13002,11 @@ define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -13789,11 +13014,11 @@ define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -13801,11 +13026,11 @@ define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s26 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 @@ -13813,12 +13038,12 @@ define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s29 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 @@ -13831,11 +13056,41 @@ define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: .LBB55_3: ; %end -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB55_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB55_5 +; SI-NEXT: .LBB55_3: +; SI-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 ; SI-NEXT: s_branch .LBB55_2 +; SI-NEXT: .LBB55_4: +; SI-NEXT: v_mov_b32_e32 v0, s36 +; SI-NEXT: v_mov_b32_e32 v1, s37 +; SI-NEXT: v_mov_b32_e32 v2, s38 +; SI-NEXT: v_mov_b32_e32 v3, s39 +; SI-NEXT: v_mov_b32_e32 v4, s40 +; SI-NEXT: v_mov_b32_e32 v5, s41 +; SI-NEXT: v_mov_b32_e32 v6, s42 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s44 +; SI-NEXT: v_mov_b32_e32 v9, s45 +; SI-NEXT: v_mov_b32_e32 v10, s46 +; SI-NEXT: v_mov_b32_e32 v11, s47 +; SI-NEXT: v_mov_b32_e32 v12, s48 +; SI-NEXT: v_mov_b32_e32 v13, s49 +; SI-NEXT: v_mov_b32_e32 v14, s50 +; SI-NEXT: v_mov_b32_e32 v15, s51 +; SI-NEXT: .LBB55_5: ; %end +; SI-NEXT: v_readlane_b32 s51, v16, 7 +; SI-NEXT: v_readlane_b32 s50, v16, 6 +; SI-NEXT: v_readlane_b32 s49, v16, 5 +; SI-NEXT: v_readlane_b32 s48, v16, 4 +; SI-NEXT: v_readlane_b32 s39, v16, 3 +; SI-NEXT: v_readlane_b32 s38, v16, 2 +; SI-NEXT: v_readlane_b32 s37, v16, 1 +; SI-NEXT: v_readlane_b32 s36, v16, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28f16_to_v7f64_scalar: ; VI: ; %bb.0: @@ -14047,6 +13302,20 @@ define <28 x half> @bitcast_v28i16_to_v28f16(<28 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v28i16_to_v28f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill @@ -14055,82 +13324,85 @@ define <28 x half> @bitcast_v28i16_to_v28f16(<28 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v25 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v23 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v22 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v19 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v14, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v47 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v31, v1, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v29, v1, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v49, v0, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v27, v1, v40 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v37, v0, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v25, v1, v42 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v32, v0, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v22, v1, v44 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v30, v0, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v19, v1, v45 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v28, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v14, v1, v47 +; SI-NEXT: v_or_b32_e32 v23, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: v_alignbit_b32 v33, v31, v50, 16 +; SI-NEXT: v_alignbit_b32 v34, v29, v51, 16 +; SI-NEXT: v_alignbit_b32 v35, v27, v53, 16 +; SI-NEXT: v_alignbit_b32 v36, v25, v55, 16 +; SI-NEXT: v_alignbit_b32 v38, v22, v41, 16 +; SI-NEXT: v_alignbit_b32 v39, v19, v43, 16 +; SI-NEXT: v_alignbit_b32 v48, v14, v46, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v46 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -14145,80 +13417,95 @@ define <28 x half> @bitcast_v28i16_to_v28f16(<28 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: .LBB56_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v47 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v46 ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v45 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v44 ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v43 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v42 ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v41 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v40 ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v54 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v52 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v50 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v12, v46, v12 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v10, v43, v10 +; SI-NEXT: v_or_b32_e32 v8, v41, v8 +; SI-NEXT: v_or_b32_e32 v6, v55, v6 +; SI-NEXT: v_or_b32_e32 v4, v53, v4 +; SI-NEXT: v_or_b32_e32 v2, v51, v2 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v37, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v49, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v12, v47, v12 +; SI-NEXT: v_or_b32_e32 v10, v45, v10 +; SI-NEXT: v_or_b32_e32 v8, v44, v8 +; SI-NEXT: v_or_b32_e32 v6, v42, v6 +; SI-NEXT: v_or_b32_e32 v4, v40, v4 +; SI-NEXT: v_or_b32_e32 v2, v54, v2 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v0 +; SI-NEXT: v_alignbit_b32 v33, v31, v49, 16 +; SI-NEXT: v_alignbit_b32 v34, v29, v37, 16 +; SI-NEXT: v_alignbit_b32 v35, v27, v32, 16 +; SI-NEXT: v_alignbit_b32 v36, v25, v30, 16 +; SI-NEXT: v_alignbit_b32 v38, v22, v28, 16 +; SI-NEXT: v_alignbit_b32 v39, v19, v23, 16 +; SI-NEXT: v_alignbit_b32 v48, v14, v20, 16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 ; SI-NEXT: .LBB56_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload @@ -14229,62 +13516,48 @@ define <28 x half> @bitcast_v28i16_to_v28f16(<28 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v16 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v29 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v20 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v32 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v23 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v35 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v26 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v37 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v30 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v39 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v33 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v24 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v35 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v21 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v36 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v18 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v38 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v17 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v39 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v16 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v48 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -14419,197 +13692,239 @@ define inreg <28 x half> @bitcast_v28i16_to_v28f16_scalar(<28 x i16> inreg %a, i ; SI-LABEL: bitcast_v28i16_to_v28f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s43, s29, 16 -; SI-NEXT: s_lshr_b32 s42, s28, 16 -; SI-NEXT: s_lshr_b32 s41, s27, 16 -; SI-NEXT: s_lshr_b32 s40, s26, 16 -; SI-NEXT: s_lshr_b32 s15, s25, 16 -; SI-NEXT: s_lshr_b32 s14, s24, 16 -; SI-NEXT: s_lshr_b32 s13, s23, 16 -; SI-NEXT: s_lshr_b32 s12, s22, 16 -; SI-NEXT: s_lshr_b32 s11, s21, 16 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_lshr_b32 s88, s29, 16 +; SI-NEXT: s_lshr_b32 s95, s28, 16 +; SI-NEXT: s_lshr_b32 s79, s27, 16 +; SI-NEXT: s_lshr_b32 s94, s26, 16 +; SI-NEXT: s_lshr_b32 s78, s25, 16 +; SI-NEXT: s_lshr_b32 s93, s24, 16 +; SI-NEXT: s_lshr_b32 s77, s23, 16 +; SI-NEXT: s_lshr_b32 s92, s22, 16 +; SI-NEXT: s_lshr_b32 s76, s21, 16 +; SI-NEXT: s_lshr_b32 s91, s20, 16 +; SI-NEXT: s_lshr_b32 s75, s19, 16 +; SI-NEXT: s_lshr_b32 s90, s18, 16 +; SI-NEXT: s_lshr_b32 s74, s17, 16 +; SI-NEXT: s_lshr_b32 s89, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s43 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s74, 16 +; SI-NEXT: s_or_b32 s15, s5, s7 +; SI-NEXT: s_and_b32 s5, s19, 0xffff +; SI-NEXT: s_lshl_b32 s7, s75, 16 +; SI-NEXT: s_or_b32 s41, s5, s7 +; SI-NEXT: s_and_b32 s5, s21, 0xffff +; SI-NEXT: s_lshl_b32 s7, s76, 16 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s14, s89, 16 +; SI-NEXT: s_or_b32 s47, s5, s7 +; SI-NEXT: s_and_b32 s5, s23, 0xffff +; SI-NEXT: s_lshl_b32 s7, s77, 16 +; SI-NEXT: s_or_b32 s12, s4, s14 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s40, s90, 16 +; SI-NEXT: s_or_b32 s57, s5, s7 +; SI-NEXT: s_and_b32 s5, s25, 0xffff +; SI-NEXT: s_lshl_b32 s7, s78, 16 +; SI-NEXT: s_or_b32 s10, s4, s40 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s46, s91, 16 +; SI-NEXT: s_or_b32 s59, s5, s7 +; SI-NEXT: s_and_b32 s5, s27, 0xffff +; SI-NEXT: s_lshl_b32 s7, s79, 16 +; SI-NEXT: s_or_b32 s8, s4, s46 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s56, s92, 16 +; SI-NEXT: s_lshl_b32 s60, s94, 16 +; SI-NEXT: s_or_b32 s61, s5, s7 +; SI-NEXT: s_and_b32 s5, s29, 0xffff +; SI-NEXT: s_lshl_b32 s7, s88, 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[14:15], 16 +; SI-NEXT: s_and_b32 s14, s26, 0xffff +; SI-NEXT: s_or_b32 s6, s4, s56 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s58, s93, 16 +; SI-NEXT: s_or_b32 s63, s5, s7 +; SI-NEXT: s_lshl_b32 s62, s95, 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[40:41], 16 +; SI-NEXT: s_or_b32 s40, s14, s60 +; SI-NEXT: s_and_b32 s14, s28, 0xffff +; SI-NEXT: s_or_b32 s4, s4, s58 +; SI-NEXT: s_mov_b32 s13, s15 +; SI-NEXT: s_mov_b32 s11, s41 +; SI-NEXT: s_mov_b32 s9, s47 +; SI-NEXT: s_lshr_b64 s[46:47], s[46:47], 16 +; SI-NEXT: s_mov_b32 s7, s57 +; SI-NEXT: s_lshr_b64 s[56:57], s[56:57], 16 +; SI-NEXT: s_mov_b32 s5, s59 +; SI-NEXT: s_lshr_b64 s[58:59], s[58:59], 16 +; SI-NEXT: s_mov_b32 s41, s61 +; SI-NEXT: s_lshr_b64 s[60:61], s[60:61], 16 +; SI-NEXT: s_or_b32 s14, s14, s62 +; SI-NEXT: s_mov_b32 s15, s63 +; SI-NEXT: s_lshr_b64 s[62:63], s[62:63], 16 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true -; SI-NEXT: s_add_i32 s43, s43, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s42, s42, 3 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s95, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s14, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s88, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s15, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s94, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s40, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s79, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s41, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s93, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s25, 0xffff +; SI-NEXT: s_lshl_b32 s6, s78, 16 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s22, 0xffff +; SI-NEXT: s_lshl_b32 s7, s92, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s23, 0xffff +; SI-NEXT: s_lshl_b32 s8, s77, 16 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s91, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s76, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s18, 0xffff +; SI-NEXT: s_lshl_b32 s11, s90, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s19, 0xffff +; SI-NEXT: s_lshl_b32 s12, s75, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s43 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s12, s16, 0xffff +; SI-NEXT: s_lshl_b32 s13, s89, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s74, 16 +; SI-NEXT: s_or_b32 s13, s16, s13 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_lshr_b64 s[42:43], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[14:15], 16 +; SI-NEXT: s_lshr_b32 s74, s13, 16 +; SI-NEXT: s_lshr_b32 s75, s11, 16 +; SI-NEXT: s_lshr_b32 s76, s9, 16 +; SI-NEXT: s_lshr_b32 s77, s7, 16 +; SI-NEXT: s_lshr_b32 s78, s5, 16 +; SI-NEXT: s_lshr_b32 s79, s41, 16 +; SI-NEXT: s_lshr_b32 s88, s15, 16 ; SI-NEXT: .LBB57_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v0, v0, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 -; SI-NEXT: v_or_b32_e32 v1, v1, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v2, v2, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 -; SI-NEXT: v_or_b32_e32 v3, v3, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v4, v4, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 -; SI-NEXT: v_or_b32_e32 v5, v5, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v23 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v16 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v25 -; SI-NEXT: v_or_b32_e32 v11, v14, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s16, s42, 16 +; SI-NEXT: s_or_b32 s12, s12, s16 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s16, s74, 16 +; SI-NEXT: s_or_b32 s13, s13, s16 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s16, s44, 16 +; SI-NEXT: s_or_b32 s10, s10, s16 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s16, s75, 16 +; SI-NEXT: s_or_b32 s11, s11, s16 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s16, s46, 16 +; SI-NEXT: s_or_b32 s8, s8, s16 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s16, s76, 16 +; SI-NEXT: s_or_b32 s9, s9, s16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s16, s56, 16 +; SI-NEXT: s_or_b32 s6, s6, s16 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s16, s77, 16 +; SI-NEXT: s_or_b32 s7, s7, s16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s16, s58, 16 +; SI-NEXT: s_or_b32 s4, s4, s16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s16, s78, 16 +; SI-NEXT: s_or_b32 s5, s5, s16 +; SI-NEXT: s_and_b32 s16, s40, 0xffff +; SI-NEXT: s_lshl_b32 s17, s60, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s41, 0xffff +; SI-NEXT: s_lshl_b32 s18, s79, 16 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s18, s62, 16 +; SI-NEXT: s_or_b32 s14, s14, s18 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s18, s88, 16 +; SI-NEXT: s_or_b32 s15, s15, s18 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: v_mov_b32_e32 v9, s5 +; SI-NEXT: v_mov_b32_e32 v10, s16 +; SI-NEXT: v_mov_b32_e32 v11, s17 +; SI-NEXT: v_mov_b32_e32 v12, s14 +; SI-NEXT: v_mov_b32_e32 v13, s15 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr62 ; SI-NEXT: s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v28i16_to_v28f16_scalar: @@ -14818,243 +14133,187 @@ define <28 x i16> @bitcast_v28f16_to_v28i16(<28 x half> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v15 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v28 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_or_b32_e32 v13, v13, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_or_b32_e32 v11, v11, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_or_b32_e32 v9, v9, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_or_b32_e32 v7, v7, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_or_b32_e32 v5, v5, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v14 ; SI-NEXT: v_or_b32_e32 v3, v3, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v23 ; SI-NEXT: v_or_b32_e32 v1, v1, v28 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v0, v0, v27 -; SI-NEXT: v_or_b32_e32 v21, v21, v26 -; SI-NEXT: v_or_b32_e32 v19, v19, v25 -; SI-NEXT: v_or_b32_e32 v18, v18, v24 -; SI-NEXT: v_or_b32_e32 v16, v16, v23 -; SI-NEXT: v_or_b32_e32 v17, v17, v22 -; SI-NEXT: v_or_b32_e32 v15, v15, v20 +; SI-NEXT: v_or_b32_e32 v2, v2, v26 +; SI-NEXT: v_or_b32_e32 v4, v4, v25 +; SI-NEXT: v_or_b32_e32 v6, v6, v24 +; SI-NEXT: v_or_b32_e32 v8, v8, v22 +; SI-NEXT: v_or_b32_e32 v10, v10, v20 +; SI-NEXT: v_or_b32_e32 v12, v12, v18 ; SI-NEXT: v_alignbit_b32 v27, v1, v27, 16 ; SI-NEXT: v_alignbit_b32 v26, v3, v26, 16 ; SI-NEXT: v_alignbit_b32 v25, v5, v25, 16 ; SI-NEXT: v_alignbit_b32 v24, v7, v24, 16 -; SI-NEXT: v_alignbit_b32 v23, v9, v23, 16 -; SI-NEXT: v_alignbit_b32 v22, v11, v22, 16 -; SI-NEXT: v_alignbit_b32 v20, v13, v20, 16 +; SI-NEXT: v_alignbit_b32 v22, v9, v22, 16 +; SI-NEXT: v_alignbit_b32 v20, v11, v20, 16 +; SI-NEXT: v_alignbit_b32 v18, v13, v18, 16 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v3, v3, v14 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v25 +; SI-NEXT: v_or_b32_e32 v4, v4, v14 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v21 +; SI-NEXT: v_or_b32_e32 v5, v5, v14 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v24 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_or_b32_e32 v7, v7, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v22 +; SI-NEXT: v_or_b32_e32 v8, v8, v14 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v17 +; SI-NEXT: v_or_b32_e32 v9, v9, v14 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 +; SI-NEXT: v_or_b32_e32 v10, v10, v14 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 -; SI-NEXT: v_or_b32_e32 v8, v8, v16 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v22 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v20 +; SI-NEXT: v_or_b32_e32 v1, v1, v23 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v26 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; SI-NEXT: v_or_b32_e32 v0, v0, v27 -; SI-NEXT: v_or_b32_e32 v2, v2, v21 -; SI-NEXT: v_or_b32_e32 v4, v4, v19 -; SI-NEXT: v_or_b32_e32 v6, v6, v18 -; SI-NEXT: v_or_b32_e32 v10, v10, v16 -; SI-NEXT: v_or_b32_e32 v12, v12, v15 +; SI-NEXT: v_or_b32_e32 v2, v2, v23 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -15190,178 +14449,124 @@ define inreg <28 x i16> @bitcast_v28f16_to_v28i16_scalar(<28 x half> inreg %a, i ; SI-LABEL: bitcast_v28f16_to_v28i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s24 +; SI-NEXT: s_lshr_b32 s15, s29, 16 +; SI-NEXT: s_lshr_b32 s12, s28, 16 +; SI-NEXT: s_lshr_b32 s10, s27, 16 +; SI-NEXT: s_lshr_b32 s13, s26, 16 +; SI-NEXT: s_lshr_b32 s11, s25, 16 +; SI-NEXT: s_lshr_b32 s14, s24, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 16 +; SI-NEXT: s_lshr_b32 s40, s22, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s41, s20, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s42, s18, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s18 -; SI-NEXT: s_lshr_b32 s6, s22, 16 -; SI-NEXT: s_lshr_b32 s7, s20, 16 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s9, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v8 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: s_cbranch_scc0 .LBB59_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: s_cbranch_execnz .LBB59_4 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v12, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v30 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s14 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s40 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v4 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s12 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s11 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 +; SI-NEXT: v_or_b32_e32 v9, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v30 +; SI-NEXT: v_or_b32_e32 v7, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s21 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s7 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s19 +; SI-NEXT: v_or_b32_e32 v5, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s17 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v16 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v38 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; SI-NEXT: v_or_b32_e32 v3, v13, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s29 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_or_b32_e32 v11, v11, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s18 +; SI-NEXT: v_or_b32_e32 v13, v13, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v6 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v31 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v20 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 ; SI-NEXT: v_or_b32_e32 v49, v14, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v39 -; SI-NEXT: v_or_b32_e32 v48, v15, v2 -; SI-NEXT: v_or_b32_e32 v38, v16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v32 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 +; SI-NEXT: v_or_b32_e32 v39, v15, v2 +; SI-NEXT: v_or_b32_e32 v37, v16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s28 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 @@ -15370,14 +14575,12 @@ define inreg <28 x i16> @bitcast_v28f16_to_v28i16_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v39, v14, v6 -; SI-NEXT: v_or_b32_e32 v36, v15, v8 -; SI-NEXT: v_or_b32_e32 v34, v16, v10 -; SI-NEXT: v_or_b32_e32 v32, v17, v12 +; SI-NEXT: v_or_b32_e32 v48, v14, v6 +; SI-NEXT: v_or_b32_e32 v38, v15, v8 +; SI-NEXT: v_or_b32_e32 v36, v16, v10 +; SI-NEXT: v_or_b32_e32 v35, v17, v12 ; SI-NEXT: v_lshr_b64 v[26:27], v[0:1], 16 ; SI-NEXT: v_lshr_b64 v[24:25], v[2:3], 16 ; SI-NEXT: v_lshr_b64 v[22:23], v[4:5], 16 @@ -15385,52 +14588,82 @@ define inreg <28 x i16> @bitcast_v28f16_to_v28i16_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_lshr_b64 v[18:19], v[8:9], 16 ; SI-NEXT: v_lshr_b64 v[16:17], v[10:11], 16 ; SI-NEXT: v_lshr_b64 v[14:15], v[12:13], 16 -; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: s_branch .LBB59_5 +; SI-NEXT: .LBB59_3: +; SI-NEXT: s_branch .LBB59_2 +; SI-NEXT: .LBB59_4: +; SI-NEXT: v_mov_b32_e32 v28, s15 +; SI-NEXT: v_mov_b32_e32 v33, s10 +; SI-NEXT: v_mov_b32_e32 v29, s11 +; SI-NEXT: v_mov_b32_e32 v30, s9 +; SI-NEXT: v_mov_b32_e32 v31, s8 +; SI-NEXT: v_mov_b32_e32 v32, s7 +; SI-NEXT: v_mov_b32_e32 v34, s6 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v35, s28 +; SI-NEXT: v_mov_b32_e32 v36, s26 +; SI-NEXT: v_mov_b32_e32 v38, s24 +; SI-NEXT: v_mov_b32_e32 v48, s22 +; SI-NEXT: v_mov_b32_e32 v37, s20 +; SI-NEXT: v_mov_b32_e32 v39, s18 +; SI-NEXT: v_mov_b32_e32 v49, s16 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: v_mov_b32_e32 v26, s43 +; SI-NEXT: v_mov_b32_e32 v24, s42 +; SI-NEXT: v_mov_b32_e32 v22, s41 +; SI-NEXT: v_mov_b32_e32 v20, s40 +; SI-NEXT: v_mov_b32_e32 v18, s14 +; SI-NEXT: v_mov_b32_e32 v16, s13 +; SI-NEXT: v_mov_b32_e32 v14, s12 +; SI-NEXT: .LBB59_5: ; %end ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v49 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v39 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v32 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v22 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v37 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v20 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v48 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v30 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v36 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v38 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v18 ; SI-NEXT: v_or_b32_e32 v8, v8, v10 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v29 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v34 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v36 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v16 ; SI-NEXT: v_or_b32_e32 v10, v10, v12 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v33 ; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v32 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v35 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v12, v12, v14 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB59_4: -; SI-NEXT: s_branch .LBB59_2 ; ; VI-LABEL: bitcast_v28f16_to_v28i16_scalar: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll index 1bcc09a680b2a..547985e7ef4e3 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll @@ -11,50 +11,45 @@ define <3 x half> @bitcast_v3bf16_to_v3f16(<3 x bfloat> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v0 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB0_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: .LBB0_2: ; %Flow +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB0_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v2, v1, v2, 16 ; SI-NEXT: .LBB0_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -255,42 +250,38 @@ define inreg <3 x half> @bitcast_v3bf16_to_v3f16_scalar(<3 x bfloat> inreg %a, i ; SI-NEXT: s_and_b32 s5, s16, 0xffff0000 ; SI-NEXT: s_lshl_b32 s6, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s6 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s5 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s5 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s6 ; SI-NEXT: s_cbranch_scc0 .LBB1_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v9 +; SI-NEXT: v_lshr_b64 v[6:7], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[4:5], 16 ; SI-NEXT: s_cbranch_execnz .LBB1_3 ; SI-NEXT: .LBB1_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_lshr_b64 v[2:3], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshr_b64 v[6:7], v[0:1], 16 ; SI-NEXT: .LBB1_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB1_4: +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_branch .LBB1_2 ; ; VI-LABEL: bitcast_v3bf16_to_v3f16_scalar: @@ -513,49 +504,43 @@ define <3 x bfloat> @bitcast_v3f16_to_v3bf16(<3 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v3f16_to_v3bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_alignbit_b32 v5, v1, v0, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v1 -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB2_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; SI-NEXT: .LBB2_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -627,46 +612,45 @@ define inreg <3 x bfloat> @bitcast_v3f16_to_v3bf16_scalar(<3 x half> inreg %a, i ; SI-LABEL: bitcast_v3f16_to_v3bf16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 16 ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v3 -; SI-NEXT: s_cbranch_scc0 .LBB3_4 +; SI-NEXT: s_cbranch_scc0 .LBB3_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; SI-NEXT: s_cbranch_execnz .LBB3_3 +; SI-NEXT: s_lshl_b32 s5, s16, 16 +; SI-NEXT: s_lshl_b32 s8, s4, 16 +; SI-NEXT: s_lshl_b32 s9, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB3_4 ; SI-NEXT: .LBB3_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s16 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; SI-NEXT: .LBB3_3: ; %end -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: s_branch .LBB3_5 +; SI-NEXT: .LBB3_3: +; SI-NEXT: ; implicit-def: $sgpr5 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: s_branch .LBB3_2 +; SI-NEXT: .LBB3_4: +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: .LBB3_5: ; %end ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v5 -; SI-NEXT: v_lshr_b64 v[0:1], v[1:2], 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB3_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: s_branch .LBB3_2 ; ; VI-LABEL: bitcast_v3f16_to_v3bf16_scalar: ; VI: ; %bb.0: @@ -1435,14 +1419,8 @@ define <3 x i16> @bitcast_v3f16_to_v3i16(<3 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v3f16_to_v3i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_alignbit_b32 v3, v1, v0, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] @@ -1535,37 +1513,36 @@ define inreg <3 x i16> @bitcast_v3f16_to_v3i16_scalar(<3 x half> inreg %a, i32 i ; SI-LABEL: bitcast_v3f16_to_v3i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 16 ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: s_cbranch_scc0 .LBB9_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: s_cbranch_execnz .LBB9_4 ; SI-NEXT: .LBB9_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshr_b64 v[2:3], v[0:1], 16 ; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: s_branch .LBB9_5 +; SI-NEXT: .LBB9_3: +; SI-NEXT: s_branch .LBB9_2 +; SI-NEXT: .LBB9_4: +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: .LBB9_5: ; %end ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB9_4: -; SI-NEXT: s_branch .LBB9_2 ; ; VI-LABEL: bitcast_v3f16_to_v3i16_scalar: ; VI: ; %bb.0: @@ -1649,38 +1626,38 @@ define <3 x half> @bitcast_v3i16_to_v3f16(<3 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v3i16_to_v3f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_alignbit_b32 v5, v1, v0, 16 +; SI-NEXT: v_mov_b32_e32 v3, v1 +; SI-NEXT: v_alignbit_b32 v1, v3, v0, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB10_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_alignbit_b32 v4, v3, v5, 16 +; SI-NEXT: v_or_b32_e32 v2, v0, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: .LBB10_2: ; %Flow +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB10_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v5 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: .LBB10_4: ; %end +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; SI-NEXT: v_alignbit_b32 v4, v0, v2, 16 +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3i16_to_v3f16: @@ -1749,32 +1726,37 @@ define inreg <3 x half> @bitcast_v3i16_to_v3f16_scalar(<3 x i16> inreg %a, i32 i ; SI-LABEL: bitcast_v3i16_to_v3f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[16:17], 16 ; SI-NEXT: s_cmp_lg_u32 s18, 0 ; SI-NEXT: s_cbranch_scc0 .LBB11_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s8, s6, 16 +; SI-NEXT: s_mov_b32 s9, s17 +; SI-NEXT: s_or_b32 s4, s4, s8 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], 16 ; SI-NEXT: s_cbranch_execnz .LBB11_3 ; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: s_add_i32 s4, s16, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s17, 3 -; SI-NEXT: s_add_i32 s4, s4, 3 -; SI-NEXT: s_add_i32 s6, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; SI-NEXT: s_and_b32 s7, s5, 0xffff +; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 ; SI-NEXT: .LBB11_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s8, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB11_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: s_branch .LBB11_2 ; ; VI-LABEL: bitcast_v3i16_to_v3f16_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index 0625121f9ea7a..fd2fec386b6bf 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -3270,240 +3270,128 @@ define <32 x half> @bitcast_v16i32_to_v32f16(<16 x i32> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: v_alignbit_b32 v16, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v17, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v18, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v19, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v20, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v22, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v24, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v27, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_alignbit_b32 v16, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v17, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v18, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v19, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v20, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v22, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v24, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v27, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v48 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v34 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v33 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v29 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v30 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v25 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v26 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v21 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v22 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v17 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v18 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v31 +; SI-NEXT: v_or_b32_e32 v2, v2, v24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v30 +; SI-NEXT: v_or_b32_e32 v4, v4, v22 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v29 +; SI-NEXT: v_or_b32_e32 v6, v6, v20 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; SI-NEXT: v_or_b32_e32 v8, v8, v19 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v26 +; SI-NEXT: v_or_b32_e32 v10, v10, v18 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 +; SI-NEXT: v_or_b32_e32 v12, v12, v17 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v23 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v27 +; SI-NEXT: v_or_b32_e32 v3, v3, v24 +; SI-NEXT: v_or_b32_e32 v5, v5, v22 +; SI-NEXT: v_or_b32_e32 v7, v7, v20 +; SI-NEXT: v_or_b32_e32 v9, v9, v19 +; SI-NEXT: v_or_b32_e32 v11, v11, v18 +; SI-NEXT: v_or_b32_e32 v13, v13, v17 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i32_to_v32f16: @@ -3629,77 +3517,43 @@ define inreg <32 x half> @bitcast_v16i32_to_v32f16_scalar(<16 x i32> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v15, s28 ; SI-NEXT: v_mov_b32_e32 v16, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_readfirstlane_b32 s21, v3 -; SI-NEXT: v_readfirstlane_b32 s20, v4 -; SI-NEXT: v_readfirstlane_b32 s19, v5 -; SI-NEXT: v_readfirstlane_b32 s18, v6 -; SI-NEXT: v_readfirstlane_b32 s17, v7 -; SI-NEXT: v_readfirstlane_b32 s16, v8 -; SI-NEXT: v_readfirstlane_b32 s15, v9 -; SI-NEXT: v_readfirstlane_b32 s14, v10 -; SI-NEXT: v_readfirstlane_b32 s13, v11 -; SI-NEXT: v_readfirstlane_b32 s12, v12 -; SI-NEXT: v_readfirstlane_b32 s11, v13 -; SI-NEXT: v_readfirstlane_b32 s10, v14 -; SI-NEXT: v_readfirstlane_b32 s8, v15 +; SI-NEXT: v_readfirstlane_b32 s18, v3 +; SI-NEXT: v_readfirstlane_b32 s19, v4 +; SI-NEXT: v_readfirstlane_b32 s16, v5 +; SI-NEXT: v_readfirstlane_b32 s17, v6 +; SI-NEXT: v_readfirstlane_b32 s14, v7 +; SI-NEXT: v_readfirstlane_b32 s15, v8 +; SI-NEXT: v_readfirstlane_b32 s12, v9 +; SI-NEXT: v_readfirstlane_b32 s13, v10 +; SI-NEXT: v_readfirstlane_b32 s10, v11 +; SI-NEXT: v_readfirstlane_b32 s11, v12 +; SI-NEXT: v_readfirstlane_b32 s8, v13 +; SI-NEXT: v_readfirstlane_b32 s9, v14 +; SI-NEXT: v_readfirstlane_b32 s6, v15 ; SI-NEXT: v_readfirstlane_b32 s7, v16 -; SI-NEXT: v_readfirstlane_b32 s6, v0 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v1 +; SI-NEXT: v_readfirstlane_b32 s4, v0 +; SI-NEXT: s_and_b64 s[20:21], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v1 ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s21 +; SI-NEXT: s_lshr_b32 s56, s5, 16 +; SI-NEXT: s_lshr_b32 s57, s7, 16 +; SI-NEXT: s_lshr_b32 s58, s9, 16 +; SI-NEXT: s_lshr_b32 s59, s11, 16 +; SI-NEXT: s_lshr_b32 s60, s13, 16 +; SI-NEXT: s_lshr_b32 s61, s15, 16 +; SI-NEXT: s_lshr_b32 s62, s17, 16 +; SI-NEXT: s_lshr_b32 s63, s19, 16 +; SI-NEXT: s_lshr_b64 s[20:21], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[22:23], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[18:19], 16 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 @@ -3710,157 +3564,111 @@ define inreg <32 x half> @bitcast_v16i32_to_v32f16_scalar(<16 x i32> inreg %a, i ; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: s_lshr_b32 s5, s20, 16 -; SI-NEXT: s_lshr_b32 s22, s19, 16 -; SI-NEXT: s_lshr_b32 s23, s18, 16 -; SI-NEXT: s_lshr_b32 s24, s17, 16 -; SI-NEXT: s_lshr_b32 s25, s16, 16 -; SI-NEXT: s_lshr_b32 s26, s15, 16 -; SI-NEXT: s_lshr_b32 s27, s14, 16 -; SI-NEXT: s_lshr_b32 s28, s13, 16 -; SI-NEXT: s_lshr_b32 s29, s12, 16 -; SI-NEXT: s_lshr_b32 s40, s11, 16 -; SI-NEXT: s_lshr_b32 s41, s10, 16 -; SI-NEXT: s_lshr_b32 s42, s8, 16 -; SI-NEXT: s_lshr_b32 s43, s7, 16 -; SI-NEXT: s_lshr_b32 s44, s6, 16 -; SI-NEXT: s_lshr_b32 s45, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: s_add_i32 s5, s5, 3 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: s_lshr_b64 s[20:21], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[22:23], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[18:19], 16 +; SI-NEXT: s_lshr_b32 s56, s5, 16 +; SI-NEXT: s_lshr_b32 s57, s7, 16 +; SI-NEXT: s_lshr_b32 s58, s9, 16 +; SI-NEXT: s_lshr_b32 s59, s11, 16 +; SI-NEXT: s_lshr_b32 s60, s13, 16 +; SI-NEXT: s_lshr_b32 s61, s15, 16 +; SI-NEXT: s_lshr_b32 s62, s17, 16 +; SI-NEXT: s_lshr_b32 s63, s19, 16 ; SI-NEXT: .LBB17_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 -; SI-NEXT: v_or_b32_e32 v0, v30, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_or_b32_e32 v2, v28, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v28 -; SI-NEXT: v_or_b32_e32 v5, v5, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v26 -; SI-NEXT: v_or_b32_e32 v7, v7, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v24 -; SI-NEXT: v_or_b32_e32 v9, v22, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v22 -; SI-NEXT: v_or_b32_e32 v11, v20, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v13, v18, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v18 -; SI-NEXT: v_or_b32_e32 v1, v32, v1 -; SI-NEXT: v_or_b32_e32 v3, v30, v3 -; SI-NEXT: v_or_b32_e32 v4, v27, v4 -; SI-NEXT: v_or_b32_e32 v6, v25, v6 -; SI-NEXT: v_or_b32_e32 v8, v23, v8 -; SI-NEXT: v_or_b32_e32 v10, v21, v10 -; SI-NEXT: v_or_b32_e32 v12, v19, v12 -; SI-NEXT: v_or_b32_e32 v14, v17, v14 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s21, s44, 16 +; SI-NEXT: s_or_b32 s18, s18, s21 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s21, s63, 16 +; SI-NEXT: s_or_b32 s19, s19, s21 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s21, s42, 16 +; SI-NEXT: s_or_b32 s16, s16, s21 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s21, s62, 16 +; SI-NEXT: s_or_b32 s17, s17, s21 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s21, s40, 16 +; SI-NEXT: s_or_b32 s14, s14, s21 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s21, s61, 16 +; SI-NEXT: s_or_b32 s15, s15, s21 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s21, s28, 16 +; SI-NEXT: s_or_b32 s12, s12, s21 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s21, s60, 16 +; SI-NEXT: s_or_b32 s13, s13, s21 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s21, s26, 16 +; SI-NEXT: s_or_b32 s10, s10, s21 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s21, s59, 16 +; SI-NEXT: s_or_b32 s11, s11, s21 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s21, s24, 16 +; SI-NEXT: s_or_b32 s8, s8, s21 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s21, s58, 16 +; SI-NEXT: s_or_b32 s9, s9, s21 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s21, s22, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_or_b32 s6, s6, s21 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s21, s57, 16 +; SI-NEXT: s_or_b32 s4, s4, s20 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s20, s56, 16 +; SI-NEXT: s_or_b32 s7, s7, s21 +; SI-NEXT: s_or_b32 s5, s5, s20 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_mov_b32_e32 v1, s19 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v3, s17 +; SI-NEXT: v_mov_b32_e32 v4, s14 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s12 +; SI-NEXT: v_mov_b32_e32 v7, s13 +; SI-NEXT: v_mov_b32_e32 v8, s10 +; SI-NEXT: v_mov_b32_e32 v9, s11 +; SI-NEXT: v_mov_b32_e32 v10, s8 +; SI-NEXT: v_mov_b32_e32 v11, s9 +; SI-NEXT: v_mov_b32_e32 v12, s6 +; SI-NEXT: v_mov_b32_e32 v13, s7 +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: v_mov_b32_e32 v15, s5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v16i32_to_v32f16_scalar: @@ -4017,182 +3825,150 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v32f16_to_v16i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_mov_b32_e32 v32, v15 +; SI-NEXT: v_mov_b32_e32 v17, v14 +; SI-NEXT: v_mov_b32_e32 v18, v13 +; SI-NEXT: v_mov_b32_e32 v19, v12 +; SI-NEXT: v_mov_b32_e32 v20, v11 +; SI-NEXT: v_mov_b32_e32 v21, v10 +; SI-NEXT: v_mov_b32_e32 v22, v9 +; SI-NEXT: v_mov_b32_e32 v23, v8 +; SI-NEXT: v_mov_b32_e32 v24, v7 +; SI-NEXT: v_mov_b32_e32 v25, v6 +; SI-NEXT: v_mov_b32_e32 v26, v5 +; SI-NEXT: v_mov_b32_e32 v27, v4 +; SI-NEXT: v_mov_b32_e32 v28, v3 +; SI-NEXT: v_mov_b32_e32 v29, v2 +; SI-NEXT: v_mov_b32_e32 v30, v1 +; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v30 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v40, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v15 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v31 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v18 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 -; SI-NEXT: v_or_b32_e32 v1, v53, v1 -; SI-NEXT: v_or_b32_e32 v2, v51, v2 -; SI-NEXT: v_or_b32_e32 v3, v49, v3 -; SI-NEXT: v_or_b32_e32 v4, v39, v4 -; SI-NEXT: v_or_b32_e32 v5, v37, v5 -; SI-NEXT: v_or_b32_e32 v6, v35, v6 -; SI-NEXT: v_or_b32_e32 v7, v33, v7 -; SI-NEXT: v_or_b32_e32 v8, v31, v8 -; SI-NEXT: v_or_b32_e32 v9, v29, v9 -; SI-NEXT: v_or_b32_e32 v10, v27, v10 -; SI-NEXT: v_or_b32_e32 v11, v25, v11 -; SI-NEXT: v_or_b32_e32 v12, v23, v12 -; SI-NEXT: v_or_b32_e32 v13, v21, v13 -; SI-NEXT: v_or_b32_e32 v14, v19, v14 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: .LBB18_2: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB18_4 -; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v53 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v52 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v50 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v49 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v48 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v39 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v37 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v36 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v35 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v33 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: .LBB18_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -4200,10 +3976,10 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -4211,11 +3987,11 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v51 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -4223,11 +3999,11 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v25 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v24 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -4235,11 +4011,11 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v48 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v39 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -4247,11 +4023,11 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v22 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v21 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -4259,11 +4035,11 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v37 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v36 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 @@ -4271,11 +4047,11 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v18 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 @@ -4283,12 +4059,12 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v34 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v32 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 @@ -4449,248 +4225,216 @@ define inreg <16 x i32> @bitcast_v32f16_to_v16i32_scalar(<32 x half> inreg %a, i ; SI-LABEL: bitcast_v32f16_to_v16i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 -; SI-NEXT: s_lshr_b32 s10, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s10 -; SI-NEXT: s_lshr_b32 s10, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s10 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s10 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s25 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: s_lshr_b32 s10, s19, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 -; SI-NEXT: s_lshr_b32 s10, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s10 -; SI-NEXT: s_lshr_b32 s10, s17, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 -; SI-NEXT: s_lshr_b32 s10, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v1 +; SI-NEXT: v_mov_b32_e32 v16, v1 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 -; SI-NEXT: v_or_b32_e32 v1, v52, v1 -; SI-NEXT: v_or_b32_e32 v2, v50, v2 -; SI-NEXT: v_or_b32_e32 v3, v48, v3 -; SI-NEXT: v_or_b32_e32 v4, v38, v4 -; SI-NEXT: v_or_b32_e32 v5, v36, v5 -; SI-NEXT: v_or_b32_e32 v6, v34, v6 -; SI-NEXT: v_or_b32_e32 v7, v31, v7 -; SI-NEXT: v_or_b32_e32 v8, v30, v8 -; SI-NEXT: v_or_b32_e32 v9, v27, v9 -; SI-NEXT: v_or_b32_e32 v10, v26, v10 -; SI-NEXT: v_or_b32_e32 v11, v23, v11 -; SI-NEXT: v_or_b32_e32 v12, v22, v12 -; SI-NEXT: v_or_b32_e32 v13, v20, v13 -; SI-NEXT: v_or_b32_e32 v14, v18, v14 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v48 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v17 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v29 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v27 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v26 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v25 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v24 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v22 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v20 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 @@ -18772,240 +18516,128 @@ define <32 x half> @bitcast_v16f32_to_v32f16(<16 x float> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB40_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: v_alignbit_b32 v16, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v17, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v18, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v19, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v20, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v22, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v24, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v27, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; SI-NEXT: .LBB40_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB40_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_alignbit_b32 v16, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v17, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v18, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v19, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v20, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v22, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v24, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v27, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; SI-NEXT: .LBB40_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v48 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v34 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v33 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v29 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v30 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v25 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v26 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v21 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v22 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v17 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v18 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v31 +; SI-NEXT: v_or_b32_e32 v2, v2, v24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v30 +; SI-NEXT: v_or_b32_e32 v4, v4, v22 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v29 +; SI-NEXT: v_or_b32_e32 v6, v6, v20 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; SI-NEXT: v_or_b32_e32 v8, v8, v19 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v26 +; SI-NEXT: v_or_b32_e32 v10, v10, v18 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 +; SI-NEXT: v_or_b32_e32 v12, v12, v17 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v23 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v27 +; SI-NEXT: v_or_b32_e32 v3, v3, v24 +; SI-NEXT: v_or_b32_e32 v5, v5, v22 +; SI-NEXT: v_or_b32_e32 v7, v7, v20 +; SI-NEXT: v_or_b32_e32 v9, v9, v19 +; SI-NEXT: v_or_b32_e32 v11, v11, v18 +; SI-NEXT: v_or_b32_e32 v13, v13, v17 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f32_to_v32f16: @@ -19109,236 +18741,142 @@ define inreg <32 x half> @bitcast_v16f32_to_v32f16_scalar(<16 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_mov_b32_e32 v55, s16 -; SI-NEXT: v_mov_b32_e32 v54, s17 -; SI-NEXT: v_mov_b32_e32 v53, s18 -; SI-NEXT: v_mov_b32_e32 v52, s19 -; SI-NEXT: v_mov_b32_e32 v51, s20 -; SI-NEXT: v_mov_b32_e32 v50, s21 -; SI-NEXT: v_mov_b32_e32 v49, s22 -; SI-NEXT: v_mov_b32_e32 v48, s23 -; SI-NEXT: v_mov_b32_e32 v39, s24 -; SI-NEXT: v_mov_b32_e32 v38, s25 -; SI-NEXT: v_mov_b32_e32 v36, s26 -; SI-NEXT: v_mov_b32_e32 v35, s27 -; SI-NEXT: v_mov_b32_e32 v33, s28 +; SI-NEXT: v_mov_b32_e32 v16, s16 +; SI-NEXT: v_mov_b32_e32 v17, s17 +; SI-NEXT: v_mov_b32_e32 v14, s18 +; SI-NEXT: v_mov_b32_e32 v15, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v37, s29 +; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB41_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v55 +; SI-NEXT: v_lshr_b64 v[18:19], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[19:20], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[20:21], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[4:5], 16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; SI-NEXT: v_lshr_b64 v[2:3], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB41_3 ; SI-NEXT: .LBB41_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v3, 1.0, v55 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v54 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v53 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v52 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v51 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v50 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v49 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v48 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v39 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v38 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v36 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v35 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v33 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v37 -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_lshr_b64 v[18:19], v[0:1], 16 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_lshr_b64 v[19:20], v[12:13], 16 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_lshr_b64 v[20:21], v[10:11], 16 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_lshr_b64 v[21:22], v[8:9], 16 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshr_b64 v[22:23], v[6:7], 16 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_lshr_b64 v[23:24], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[16:17], 16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 ; SI-NEXT: .LBB41_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v30, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_or_b32_e32 v2, v28, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v28 -; SI-NEXT: v_or_b32_e32 v5, v5, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v26 -; SI-NEXT: v_or_b32_e32 v7, v7, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v24 -; SI-NEXT: v_or_b32_e32 v9, v22, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v22 -; SI-NEXT: v_or_b32_e32 v11, v20, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v20 -; SI-NEXT: v_or_b32_e32 v13, v18, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v18 -; SI-NEXT: v_or_b32_e32 v3, v30, v3 -; SI-NEXT: v_or_b32_e32 v4, v27, v4 -; SI-NEXT: v_or_b32_e32 v6, v25, v6 -; SI-NEXT: v_or_b32_e32 v8, v23, v8 -; SI-NEXT: v_or_b32_e32 v10, v21, v10 -; SI-NEXT: v_or_b32_e32 v12, v19, v12 -; SI-NEXT: v_or_b32_e32 v14, v17, v14 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v33 +; SI-NEXT: v_or_b32_e32 v17, v3, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; SI-NEXT: v_or_b32_e32 v3, v3, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v23 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v14 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; SI-NEXT: v_or_b32_e32 v5, v5, v14 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v22 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 +; SI-NEXT: v_or_b32_e32 v7, v7, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v21 +; SI-NEXT: v_or_b32_e32 v8, v8, v14 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v29 +; SI-NEXT: v_or_b32_e32 v9, v9, v14 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 +; SI-NEXT: v_or_b32_e32 v10, v10, v14 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v27 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v18 +; SI-NEXT: v_or_b32_e32 v14, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v16 +; SI-NEXT: v_mov_b32_e32 v1, v17 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB41_4: -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: s_branch .LBB41_2 ; ; VI-LABEL: bitcast_v16f32_to_v32f16_scalar: @@ -19499,126 +19037,110 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v32f16_to_v16f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_mov_b32_e32 v32, v15 +; SI-NEXT: v_mov_b32_e32 v17, v14 +; SI-NEXT: v_mov_b32_e32 v18, v13 +; SI-NEXT: v_mov_b32_e32 v19, v12 +; SI-NEXT: v_mov_b32_e32 v20, v11 +; SI-NEXT: v_mov_b32_e32 v21, v10 +; SI-NEXT: v_mov_b32_e32 v22, v9 +; SI-NEXT: v_mov_b32_e32 v23, v8 +; SI-NEXT: v_mov_b32_e32 v24, v7 +; SI-NEXT: v_mov_b32_e32 v25, v6 +; SI-NEXT: v_mov_b32_e32 v26, v5 +; SI-NEXT: v_mov_b32_e32 v27, v4 +; SI-NEXT: v_mov_b32_e32 v28, v3 +; SI-NEXT: v_mov_b32_e32 v29, v2 +; SI-NEXT: v_mov_b32_e32 v30, v1 +; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v30 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v40, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v15 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v31 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v18 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 -; SI-NEXT: v_or_b32_e32 v1, v53, v1 -; SI-NEXT: v_or_b32_e32 v2, v51, v2 -; SI-NEXT: v_or_b32_e32 v3, v49, v3 -; SI-NEXT: v_or_b32_e32 v4, v39, v4 -; SI-NEXT: v_or_b32_e32 v5, v37, v5 -; SI-NEXT: v_or_b32_e32 v6, v35, v6 -; SI-NEXT: v_or_b32_e32 v7, v33, v7 -; SI-NEXT: v_or_b32_e32 v8, v31, v8 -; SI-NEXT: v_or_b32_e32 v9, v29, v9 -; SI-NEXT: v_or_b32_e32 v10, v27, v10 -; SI-NEXT: v_or_b32_e32 v11, v25, v11 -; SI-NEXT: v_or_b32_e32 v12, v23, v12 -; SI-NEXT: v_or_b32_e32 v13, v21, v13 -; SI-NEXT: v_or_b32_e32 v14, v19, v14 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v53 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v52 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v50 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v49 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v48 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v39 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v37 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v36 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v35 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v33 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 @@ -19635,30 +19157,14 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: .LBB42_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -19671,10 +19177,10 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -19682,10 +19188,10 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -19693,11 +19199,11 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v51 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -19705,11 +19211,11 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v25 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v24 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -19717,11 +19223,11 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v48 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v39 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -19729,11 +19235,11 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v22 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v21 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -19741,11 +19247,11 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v37 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v36 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 @@ -19753,11 +19259,11 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v18 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 @@ -19765,12 +19271,12 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v34 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v32 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 @@ -19931,248 +19437,216 @@ define inreg <16 x float> @bitcast_v32f16_to_v16f32_scalar(<32 x half> inreg %a, ; SI-LABEL: bitcast_v32f16_to_v16f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 -; SI-NEXT: s_lshr_b32 s10, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s10 -; SI-NEXT: s_lshr_b32 s10, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s10 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s10 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s25 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: s_lshr_b32 s10, s19, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 -; SI-NEXT: s_lshr_b32 s10, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s10 -; SI-NEXT: s_lshr_b32 s10, s17, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 -; SI-NEXT: s_lshr_b32 s10, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v1 +; SI-NEXT: v_mov_b32_e32 v16, v1 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 ; SI-NEXT: s_cbranch_scc0 .LBB43_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 -; SI-NEXT: v_or_b32_e32 v1, v52, v1 -; SI-NEXT: v_or_b32_e32 v2, v50, v2 -; SI-NEXT: v_or_b32_e32 v3, v48, v3 -; SI-NEXT: v_or_b32_e32 v4, v38, v4 -; SI-NEXT: v_or_b32_e32 v5, v36, v5 -; SI-NEXT: v_or_b32_e32 v6, v34, v6 -; SI-NEXT: v_or_b32_e32 v7, v31, v7 -; SI-NEXT: v_or_b32_e32 v8, v30, v8 -; SI-NEXT: v_or_b32_e32 v9, v27, v9 -; SI-NEXT: v_or_b32_e32 v10, v26, v10 -; SI-NEXT: v_or_b32_e32 v11, v23, v11 -; SI-NEXT: v_or_b32_e32 v12, v22, v12 -; SI-NEXT: v_or_b32_e32 v13, v20, v13 -; SI-NEXT: v_or_b32_e32 v14, v18, v14 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB43_3 ; SI-NEXT: .LBB43_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v48 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v17 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v29 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v27 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v26 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v25 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v24 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v22 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v20 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 @@ -33786,106 +33260,42 @@ define <32 x half> @bitcast_v8i64_to_v32f16(<8 x i64> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB60_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: v_alignbit_b32 v16, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v17, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v18, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v19, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v20, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v21, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v24, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v27, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; SI-NEXT: .LBB60_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB60_4 @@ -33906,120 +33316,72 @@ define <32 x half> @bitcast_v8i64_to_v32f16(<8 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_alignbit_b32 v16, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v17, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v18, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v19, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v20, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v21, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v24, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v27, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; SI-NEXT: .LBB60_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v48 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v34 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v33 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v29 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v30 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v25 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v26 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v21 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v22 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v17 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v18 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v31 +; SI-NEXT: v_or_b32_e32 v2, v2, v24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v30 +; SI-NEXT: v_or_b32_e32 v4, v4, v21 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: v_or_b32_e32 v6, v6, v20 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; SI-NEXT: v_or_b32_e32 v8, v8, v19 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v26 +; SI-NEXT: v_or_b32_e32 v10, v10, v18 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 +; SI-NEXT: v_or_b32_e32 v12, v12, v17 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v23 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v27 +; SI-NEXT: v_or_b32_e32 v3, v3, v24 +; SI-NEXT: v_or_b32_e32 v5, v5, v21 +; SI-NEXT: v_or_b32_e32 v7, v7, v20 +; SI-NEXT: v_or_b32_e32 v9, v9, v19 +; SI-NEXT: v_or_b32_e32 v11, v11, v18 +; SI-NEXT: v_or_b32_e32 v13, v13, v17 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i64_to_v32f16: @@ -34149,238 +33511,158 @@ define inreg <32 x half> @bitcast_v8i64_to_v32f16_scalar(<8 x i64> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v15, s28 ; SI-NEXT: v_mov_b32_e32 v16, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_readfirstlane_b32 s20, v3 -; SI-NEXT: v_readfirstlane_b32 s21, v4 -; SI-NEXT: v_readfirstlane_b32 s18, v5 -; SI-NEXT: v_readfirstlane_b32 s19, v6 -; SI-NEXT: v_readfirstlane_b32 s16, v7 -; SI-NEXT: v_readfirstlane_b32 s17, v8 -; SI-NEXT: v_readfirstlane_b32 s14, v9 -; SI-NEXT: v_readfirstlane_b32 s15, v10 -; SI-NEXT: v_readfirstlane_b32 s12, v11 -; SI-NEXT: v_readfirstlane_b32 s13, v12 -; SI-NEXT: v_readfirstlane_b32 s10, v13 -; SI-NEXT: v_readfirstlane_b32 s11, v14 -; SI-NEXT: v_readfirstlane_b32 s7, v15 -; SI-NEXT: v_readfirstlane_b32 s8, v16 -; SI-NEXT: v_readfirstlane_b32 s6, v0 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v1 +; SI-NEXT: v_readfirstlane_b32 s18, v3 +; SI-NEXT: v_readfirstlane_b32 s19, v4 +; SI-NEXT: v_readfirstlane_b32 s16, v5 +; SI-NEXT: v_readfirstlane_b32 s17, v6 +; SI-NEXT: v_readfirstlane_b32 s14, v7 +; SI-NEXT: v_readfirstlane_b32 s15, v8 +; SI-NEXT: v_readfirstlane_b32 s12, v9 +; SI-NEXT: v_readfirstlane_b32 s13, v10 +; SI-NEXT: v_readfirstlane_b32 s10, v11 +; SI-NEXT: v_readfirstlane_b32 s11, v12 +; SI-NEXT: v_readfirstlane_b32 s8, v13 +; SI-NEXT: v_readfirstlane_b32 s9, v14 +; SI-NEXT: v_readfirstlane_b32 s6, v15 +; SI-NEXT: v_readfirstlane_b32 s7, v16 +; SI-NEXT: v_readfirstlane_b32 s4, v0 +; SI-NEXT: s_and_b64 s[20:21], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v1 ; SI-NEXT: s_cbranch_scc0 .LBB61_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s20 +; SI-NEXT: s_lshr_b32 s56, s5, 16 +; SI-NEXT: s_lshr_b32 s57, s7, 16 +; SI-NEXT: s_lshr_b32 s58, s9, 16 +; SI-NEXT: s_lshr_b32 s59, s11, 16 +; SI-NEXT: s_lshr_b32 s60, s13, 16 +; SI-NEXT: s_lshr_b32 s61, s15, 16 +; SI-NEXT: s_lshr_b32 s62, s17, 16 +; SI-NEXT: s_lshr_b32 s63, s19, 16 +; SI-NEXT: s_lshr_b64 s[20:21], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[22:23], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[18:19], 16 ; SI-NEXT: s_cbranch_execnz .LBB61_3 ; SI-NEXT: .LBB61_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s20, 3 -; SI-NEXT: s_addc_u32 s5, s21, 0 -; SI-NEXT: s_lshr_b32 s20, s4, 16 -; SI-NEXT: s_lshr_b32 s21, s5, 16 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_lshr_b32 s22, s18, 16 -; SI-NEXT: s_lshr_b32 s23, s19, 16 -; SI-NEXT: s_add_u32 s16, s16, 3 -; SI-NEXT: s_addc_u32 s17, s17, 0 -; SI-NEXT: s_lshr_b32 s24, s16, 16 -; SI-NEXT: s_lshr_b32 s25, s17, 16 -; SI-NEXT: s_add_u32 s14, s14, 3 -; SI-NEXT: s_addc_u32 s15, s15, 0 -; SI-NEXT: s_lshr_b32 s26, s14, 16 -; SI-NEXT: s_lshr_b32 s27, s15, 16 -; SI-NEXT: s_add_u32 s12, s12, 3 -; SI-NEXT: s_addc_u32 s13, s13, 0 -; SI-NEXT: s_lshr_b32 s28, s12, 16 -; SI-NEXT: s_lshr_b32 s29, s13, 16 -; SI-NEXT: s_add_u32 s10, s10, 3 -; SI-NEXT: s_addc_u32 s11, s11, 0 -; SI-NEXT: s_lshr_b32 s40, s10, 16 -; SI-NEXT: s_lshr_b32 s41, s11, 16 -; SI-NEXT: s_add_u32 s7, s7, 3 -; SI-NEXT: s_addc_u32 s8, s8, 0 -; SI-NEXT: s_lshr_b32 s42, s7, 16 -; SI-NEXT: s_lshr_b32 s43, s8, 16 +; SI-NEXT: s_add_u32 s4, s4, 3 +; SI-NEXT: s_addc_u32 s5, s5, 0 ; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s7, s7, 0 +; SI-NEXT: s_add_u32 s8, s8, 3 ; SI-NEXT: s_addc_u32 s9, s9, 0 -; SI-NEXT: s_lshr_b32 s44, s6, 16 -; SI-NEXT: s_lshr_b32 s45, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s20 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_lshr_b32 s56, s5, 16 +; SI-NEXT: s_lshr_b32 s57, s7, 16 +; SI-NEXT: s_lshr_b32 s58, s9, 16 +; SI-NEXT: s_lshr_b32 s59, s11, 16 +; SI-NEXT: s_lshr_b32 s60, s13, 16 +; SI-NEXT: s_lshr_b32 s61, s15, 16 +; SI-NEXT: s_lshr_b32 s62, s17, 16 +; SI-NEXT: s_lshr_b32 s63, s19, 16 +; SI-NEXT: s_lshr_b64 s[20:21], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[22:23], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[18:19], 16 ; SI-NEXT: .LBB61_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 -; SI-NEXT: v_or_b32_e32 v0, v30, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_or_b32_e32 v2, v28, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v28 -; SI-NEXT: v_or_b32_e32 v5, v5, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v26 -; SI-NEXT: v_or_b32_e32 v7, v7, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v24 -; SI-NEXT: v_or_b32_e32 v9, v22, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v22 -; SI-NEXT: v_or_b32_e32 v11, v20, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v13, v18, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v18 -; SI-NEXT: v_or_b32_e32 v1, v32, v1 -; SI-NEXT: v_or_b32_e32 v3, v30, v3 -; SI-NEXT: v_or_b32_e32 v4, v27, v4 -; SI-NEXT: v_or_b32_e32 v6, v25, v6 -; SI-NEXT: v_or_b32_e32 v8, v23, v8 -; SI-NEXT: v_or_b32_e32 v10, v21, v10 -; SI-NEXT: v_or_b32_e32 v12, v19, v12 -; SI-NEXT: v_or_b32_e32 v14, v17, v14 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s21, s44, 16 +; SI-NEXT: s_or_b32 s18, s18, s21 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s21, s63, 16 +; SI-NEXT: s_or_b32 s19, s19, s21 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s21, s42, 16 +; SI-NEXT: s_or_b32 s16, s16, s21 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s21, s62, 16 +; SI-NEXT: s_or_b32 s17, s17, s21 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s21, s40, 16 +; SI-NEXT: s_or_b32 s14, s14, s21 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s21, s61, 16 +; SI-NEXT: s_or_b32 s15, s15, s21 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s21, s28, 16 +; SI-NEXT: s_or_b32 s12, s12, s21 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s21, s60, 16 +; SI-NEXT: s_or_b32 s13, s13, s21 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s21, s26, 16 +; SI-NEXT: s_or_b32 s10, s10, s21 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s21, s59, 16 +; SI-NEXT: s_or_b32 s11, s11, s21 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s21, s24, 16 +; SI-NEXT: s_or_b32 s8, s8, s21 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s21, s58, 16 +; SI-NEXT: s_or_b32 s9, s9, s21 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s21, s22, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_or_b32 s6, s6, s21 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s21, s57, 16 +; SI-NEXT: s_or_b32 s4, s4, s20 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s20, s56, 16 +; SI-NEXT: s_or_b32 s7, s7, s21 +; SI-NEXT: s_or_b32 s5, s5, s20 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_mov_b32_e32 v1, s19 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v3, s17 +; SI-NEXT: v_mov_b32_e32 v4, s14 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s12 +; SI-NEXT: v_mov_b32_e32 v7, s13 +; SI-NEXT: v_mov_b32_e32 v8, s10 +; SI-NEXT: v_mov_b32_e32 v9, s11 +; SI-NEXT: v_mov_b32_e32 v10, s8 +; SI-NEXT: v_mov_b32_e32 v11, s9 +; SI-NEXT: v_mov_b32_e32 v12, s6 +; SI-NEXT: v_mov_b32_e32 v13, s7 +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: v_mov_b32_e32 v15, s5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB61_4: -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: s_branch .LBB61_2 ; ; VI-LABEL: bitcast_v8i64_to_v32f16_scalar: @@ -34537,126 +33819,110 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v32f16_to_v8i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_mov_b32_e32 v32, v15 +; SI-NEXT: v_mov_b32_e32 v17, v14 +; SI-NEXT: v_mov_b32_e32 v18, v13 +; SI-NEXT: v_mov_b32_e32 v19, v12 +; SI-NEXT: v_mov_b32_e32 v20, v11 +; SI-NEXT: v_mov_b32_e32 v21, v10 +; SI-NEXT: v_mov_b32_e32 v22, v9 +; SI-NEXT: v_mov_b32_e32 v23, v8 +; SI-NEXT: v_mov_b32_e32 v24, v7 +; SI-NEXT: v_mov_b32_e32 v25, v6 +; SI-NEXT: v_mov_b32_e32 v26, v5 +; SI-NEXT: v_mov_b32_e32 v27, v4 +; SI-NEXT: v_mov_b32_e32 v28, v3 +; SI-NEXT: v_mov_b32_e32 v29, v2 +; SI-NEXT: v_mov_b32_e32 v30, v1 +; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v30 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v40, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v15 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v31 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB62_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v18 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 -; SI-NEXT: v_or_b32_e32 v1, v53, v1 -; SI-NEXT: v_or_b32_e32 v2, v51, v2 -; SI-NEXT: v_or_b32_e32 v3, v49, v3 -; SI-NEXT: v_or_b32_e32 v4, v39, v4 -; SI-NEXT: v_or_b32_e32 v5, v37, v5 -; SI-NEXT: v_or_b32_e32 v6, v35, v6 -; SI-NEXT: v_or_b32_e32 v7, v33, v7 -; SI-NEXT: v_or_b32_e32 v8, v31, v8 -; SI-NEXT: v_or_b32_e32 v9, v29, v9 -; SI-NEXT: v_or_b32_e32 v10, v27, v10 -; SI-NEXT: v_or_b32_e32 v11, v25, v11 -; SI-NEXT: v_or_b32_e32 v12, v23, v12 -; SI-NEXT: v_or_b32_e32 v13, v21, v13 -; SI-NEXT: v_or_b32_e32 v14, v19, v14 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v53 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v52 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v50 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v49 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v48 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v39 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v37 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v36 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v35 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v33 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 @@ -34673,30 +33939,14 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: .LBB62_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB62_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -34709,10 +33959,10 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -34720,10 +33970,10 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -34731,11 +33981,11 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v51 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -34743,11 +33993,11 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v25 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v24 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -34755,11 +34005,11 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v48 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v39 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -34767,11 +34017,11 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v22 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v21 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -34779,11 +34029,11 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v37 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v36 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 @@ -34791,11 +34041,11 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v18 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 @@ -34803,12 +34053,12 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v34 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v32 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 @@ -34969,248 +34219,216 @@ define inreg <8 x i64> @bitcast_v32f16_to_v8i64_scalar(<32 x half> inreg %a, i32 ; SI-LABEL: bitcast_v32f16_to_v8i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 -; SI-NEXT: s_lshr_b32 s10, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s10 -; SI-NEXT: s_lshr_b32 s10, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s10 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s10 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s25 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: s_lshr_b32 s10, s19, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 -; SI-NEXT: s_lshr_b32 s10, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s10 -; SI-NEXT: s_lshr_b32 s10, s17, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 -; SI-NEXT: s_lshr_b32 s10, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v1 +; SI-NEXT: v_mov_b32_e32 v16, v1 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 ; SI-NEXT: s_cbranch_scc0 .LBB63_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 -; SI-NEXT: v_or_b32_e32 v1, v52, v1 -; SI-NEXT: v_or_b32_e32 v2, v50, v2 -; SI-NEXT: v_or_b32_e32 v3, v48, v3 -; SI-NEXT: v_or_b32_e32 v4, v38, v4 -; SI-NEXT: v_or_b32_e32 v5, v36, v5 -; SI-NEXT: v_or_b32_e32 v6, v34, v6 -; SI-NEXT: v_or_b32_e32 v7, v31, v7 -; SI-NEXT: v_or_b32_e32 v8, v30, v8 -; SI-NEXT: v_or_b32_e32 v9, v27, v9 -; SI-NEXT: v_or_b32_e32 v10, v26, v10 -; SI-NEXT: v_or_b32_e32 v11, v23, v11 -; SI-NEXT: v_or_b32_e32 v12, v22, v12 -; SI-NEXT: v_or_b32_e32 v13, v20, v13 -; SI-NEXT: v_or_b32_e32 v14, v18, v14 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB63_3 ; SI-NEXT: .LBB63_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v48 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v29 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v27 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v26 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v25 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v24 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v22 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v17 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 @@ -47986,98 +47204,42 @@ define <32 x half> @bitcast_v8f64_to_v32f16(<8 x double> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB76_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: v_alignbit_b32 v16, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v17, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v18, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v19, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v20, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v22, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v25, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v28, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; SI-NEXT: .LBB76_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB76_4 @@ -48088,122 +47250,74 @@ define <32 x half> @bitcast_v8f64_to_v32f16(<8 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_alignbit_b32 v16, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v17, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v18, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v19, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v20, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v22, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v25, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v28, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; SI-NEXT: .LBB76_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v48 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v34 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v33 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v29 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v30 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v25 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v26 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v21 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v22 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v17 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v18 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v28 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v31 +; SI-NEXT: v_or_b32_e32 v2, v2, v25 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v30 +; SI-NEXT: v_or_b32_e32 v4, v4, v22 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v29 +; SI-NEXT: v_or_b32_e32 v6, v6, v20 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 +; SI-NEXT: v_or_b32_e32 v8, v8, v19 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v26 +; SI-NEXT: v_or_b32_e32 v10, v10, v18 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 +; SI-NEXT: v_or_b32_e32 v12, v12, v17 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v23 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v28 +; SI-NEXT: v_or_b32_e32 v3, v3, v25 +; SI-NEXT: v_or_b32_e32 v5, v5, v22 +; SI-NEXT: v_or_b32_e32 v7, v7, v20 +; SI-NEXT: v_or_b32_e32 v9, v9, v19 +; SI-NEXT: v_or_b32_e32 v11, v11, v18 +; SI-NEXT: v_or_b32_e32 v13, v13, v17 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f64_to_v32f16: @@ -48291,228 +47405,134 @@ define inreg <32 x half> @bitcast_v8f64_to_v32f16_scalar(<8 x double> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_mov_b32_e32 v15, s16 -; SI-NEXT: v_mov_b32_e32 v16, s17 -; SI-NEXT: v_mov_b32_e32 v13, s18 -; SI-NEXT: v_mov_b32_e32 v14, s19 -; SI-NEXT: v_mov_b32_e32 v11, s20 -; SI-NEXT: v_mov_b32_e32 v12, s21 -; SI-NEXT: v_mov_b32_e32 v9, s22 -; SI-NEXT: v_mov_b32_e32 v10, s23 -; SI-NEXT: v_mov_b32_e32 v7, s24 -; SI-NEXT: v_mov_b32_e32 v8, s25 -; SI-NEXT: v_mov_b32_e32 v5, s26 -; SI-NEXT: v_mov_b32_e32 v6, s27 -; SI-NEXT: v_mov_b32_e32 v3, s28 +; SI-NEXT: v_mov_b32_e32 v16, s16 +; SI-NEXT: v_mov_b32_e32 v17, s17 +; SI-NEXT: v_mov_b32_e32 v14, s18 +; SI-NEXT: v_mov_b32_e32 v15, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v4, s29 +; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB77_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v15 +; SI-NEXT: v_lshr_b64 v[18:19], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[19:20], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[20:21], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[4:5], 16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; SI-NEXT: v_lshr_b64 v[2:3], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB77_3 ; SI-NEXT: .LBB77_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_add_f64 v[2:3], v[3:4], 1.0 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_lshr_b64 v[18:19], v[0:1], 16 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_lshr_b64 v[19:20], v[12:13], 16 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_lshr_b64 v[20:21], v[10:11], 16 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_lshr_b64 v[21:22], v[8:9], 16 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_lshr_b64 v[22:23], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[16:17], 16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v9 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 ; SI-NEXT: .LBB77_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v52 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v51 -; SI-NEXT: v_or_b32_e32 v1, v4, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v39 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v38 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v35 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v34 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v30 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v31 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v26 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v27 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v22 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v23 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v18 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v19 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v33 +; SI-NEXT: v_or_b32_e32 v17, v3, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; SI-NEXT: v_or_b32_e32 v3, v3, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v23 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v14 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; SI-NEXT: v_or_b32_e32 v5, v5, v14 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v22 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 +; SI-NEXT: v_or_b32_e32 v7, v7, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v21 +; SI-NEXT: v_or_b32_e32 v8, v8, v14 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v29 +; SI-NEXT: v_or_b32_e32 v9, v9, v14 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 +; SI-NEXT: v_or_b32_e32 v10, v10, v14 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v27 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v18 +; SI-NEXT: v_or_b32_e32 v14, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v16 +; SI-NEXT: v_mov_b32_e32 v1, v17 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB77_4: -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: s_branch .LBB77_2 ; ; VI-LABEL: bitcast_v8f64_to_v32f16_scalar: @@ -48649,126 +47669,110 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v32f16_to_v8f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_mov_b32_e32 v32, v15 +; SI-NEXT: v_mov_b32_e32 v17, v14 +; SI-NEXT: v_mov_b32_e32 v18, v13 +; SI-NEXT: v_mov_b32_e32 v19, v12 +; SI-NEXT: v_mov_b32_e32 v20, v11 +; SI-NEXT: v_mov_b32_e32 v21, v10 +; SI-NEXT: v_mov_b32_e32 v22, v9 +; SI-NEXT: v_mov_b32_e32 v23, v8 +; SI-NEXT: v_mov_b32_e32 v24, v7 +; SI-NEXT: v_mov_b32_e32 v25, v6 +; SI-NEXT: v_mov_b32_e32 v26, v5 +; SI-NEXT: v_mov_b32_e32 v27, v4 +; SI-NEXT: v_mov_b32_e32 v28, v3 +; SI-NEXT: v_mov_b32_e32 v29, v2 +; SI-NEXT: v_mov_b32_e32 v30, v1 +; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v30 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v40, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v15 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v31 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB78_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v18 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 -; SI-NEXT: v_or_b32_e32 v1, v53, v1 -; SI-NEXT: v_or_b32_e32 v2, v51, v2 -; SI-NEXT: v_or_b32_e32 v3, v49, v3 -; SI-NEXT: v_or_b32_e32 v4, v39, v4 -; SI-NEXT: v_or_b32_e32 v5, v37, v5 -; SI-NEXT: v_or_b32_e32 v6, v35, v6 -; SI-NEXT: v_or_b32_e32 v7, v33, v7 -; SI-NEXT: v_or_b32_e32 v8, v31, v8 -; SI-NEXT: v_or_b32_e32 v9, v29, v9 -; SI-NEXT: v_or_b32_e32 v10, v27, v10 -; SI-NEXT: v_or_b32_e32 v11, v25, v11 -; SI-NEXT: v_or_b32_e32 v12, v23, v12 -; SI-NEXT: v_or_b32_e32 v13, v21, v13 -; SI-NEXT: v_or_b32_e32 v14, v19, v14 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v53 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v52 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v50 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v49 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v48 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v39 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v37 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v36 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v35 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v33 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 @@ -48785,30 +47789,14 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: .LBB78_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB78_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -48821,10 +47809,10 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -48832,10 +47820,10 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -48843,11 +47831,11 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v51 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -48855,11 +47843,11 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v25 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v24 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -48867,11 +47855,11 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v48 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v39 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -48879,11 +47867,11 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v22 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v21 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -48891,11 +47879,11 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v37 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v36 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 @@ -48903,11 +47891,11 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v18 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 @@ -48915,12 +47903,12 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v34 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v32 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 @@ -49081,248 +48069,216 @@ define inreg <8 x double> @bitcast_v32f16_to_v8f64_scalar(<32 x half> inreg %a, ; SI-LABEL: bitcast_v32f16_to_v8f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 -; SI-NEXT: s_lshr_b32 s10, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s10 -; SI-NEXT: s_lshr_b32 s10, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s10 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s10 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s25 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: s_lshr_b32 s10, s19, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 -; SI-NEXT: s_lshr_b32 s10, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s10 -; SI-NEXT: s_lshr_b32 s10, s17, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 -; SI-NEXT: s_lshr_b32 s10, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v1 +; SI-NEXT: v_mov_b32_e32 v16, v1 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 ; SI-NEXT: s_cbranch_scc0 .LBB79_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 -; SI-NEXT: v_or_b32_e32 v1, v52, v1 -; SI-NEXT: v_or_b32_e32 v2, v50, v2 -; SI-NEXT: v_or_b32_e32 v3, v48, v3 -; SI-NEXT: v_or_b32_e32 v4, v38, v4 -; SI-NEXT: v_or_b32_e32 v5, v36, v5 -; SI-NEXT: v_or_b32_e32 v6, v34, v6 -; SI-NEXT: v_or_b32_e32 v7, v31, v7 -; SI-NEXT: v_or_b32_e32 v8, v30, v8 -; SI-NEXT: v_or_b32_e32 v9, v27, v9 -; SI-NEXT: v_or_b32_e32 v10, v26, v10 -; SI-NEXT: v_or_b32_e32 v11, v23, v11 -; SI-NEXT: v_or_b32_e32 v12, v22, v12 -; SI-NEXT: v_or_b32_e32 v13, v20, v13 -; SI-NEXT: v_or_b32_e32 v14, v18, v14 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB79_3 ; SI-NEXT: .LBB79_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v48 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v17 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v29 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v27 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v26 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v25 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v24 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v22 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v20 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 @@ -60811,6 +59767,22 @@ define <32 x half> @bitcast_v32i16_to_v32f16(<32 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v32i16_to_v32f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v0 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -60827,92 +59799,95 @@ define <32 x half> @bitcast_v32i16_to_v32f16(<32 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v27 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v26 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v24 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v21 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB88_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v63 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v36, v1, v42 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v34, v1, v44 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v55, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v32, v1, v46 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v51, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v29, v1, v56 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v39, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v26, v1, v58 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v35, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v23, v1, v60 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v33, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v21, v1, v62 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v30, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v16, v1, v63 +; SI-NEXT: v_or_b32_e32 v27, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: v_alignbit_b32 v37, v36, v40, 16 +; SI-NEXT: v_alignbit_b32 v38, v34, v41, 16 +; SI-NEXT: v_alignbit_b32 v48, v32, v43, 16 +; SI-NEXT: v_alignbit_b32 v49, v29, v45, 16 +; SI-NEXT: v_alignbit_b32 v50, v26, v47, 16 +; SI-NEXT: v_alignbit_b32 v52, v23, v57, 16 +; SI-NEXT: v_alignbit_b32 v53, v21, v59, 16 +; SI-NEXT: v_alignbit_b32 v54, v16, v61, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v61 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -60929,90 +59904,107 @@ define <32 x half> @bitcast_v32i16_to_v32f16(<32 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: .LBB88_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB88_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v63 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v62 ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v61 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v60 ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v59 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v58 ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v57 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v56 ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v47 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v46 ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v45 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v44 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v43 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v42 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v41 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v40 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v14, v61, v14 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v12, v59, v12 +; SI-NEXT: v_or_b32_e32 v10, v57, v10 +; SI-NEXT: v_or_b32_e32 v8, v47, v8 +; SI-NEXT: v_or_b32_e32 v6, v45, v6 +; SI-NEXT: v_or_b32_e32 v4, v43, v4 +; SI-NEXT: v_or_b32_e32 v2, v41, v2 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v33, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v35, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v39, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v51, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v55, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v14, v63, v14 +; SI-NEXT: v_or_b32_e32 v12, v62, v12 +; SI-NEXT: v_or_b32_e32 v10, v60, v10 +; SI-NEXT: v_or_b32_e32 v8, v58, v8 +; SI-NEXT: v_or_b32_e32 v6, v56, v6 +; SI-NEXT: v_or_b32_e32 v4, v46, v4 +; SI-NEXT: v_or_b32_e32 v2, v44, v2 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v36, vcc, s6, v0 +; SI-NEXT: v_alignbit_b32 v37, v36, v55, 16 +; SI-NEXT: v_alignbit_b32 v38, v34, v51, 16 +; SI-NEXT: v_alignbit_b32 v48, v32, v39, 16 +; SI-NEXT: v_alignbit_b32 v49, v29, v35, 16 +; SI-NEXT: v_alignbit_b32 v50, v26, v33, 16 +; SI-NEXT: v_alignbit_b32 v52, v23, v30, 16 +; SI-NEXT: v_alignbit_b32 v53, v21, v27, 16 +; SI-NEXT: v_alignbit_b32 v54, v16, v24, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 ; SI-NEXT: .LBB88_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload @@ -61031,70 +60023,54 @@ define <32 x half> @bitcast_v32i16_to_v32f16(<32 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v18 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v31 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v22 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v35 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v25 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v38 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v28 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v49 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v32 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v51 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v36 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v53 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v39 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v28 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v25 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v22 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v50 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v20 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v52 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v53 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v18 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v54 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -61239,223 +60215,299 @@ define inreg <32 x half> @bitcast_v32i16_to_v32f16_scalar(<32 x i16> inreg %a, i ; SI-LABEL: bitcast_v32i16_to_v32f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s43, s29, 16 -; SI-NEXT: s_lshr_b32 s42, s28, 16 -; SI-NEXT: s_lshr_b32 s41, s27, 16 -; SI-NEXT: s_lshr_b32 s40, s26, 16 -; SI-NEXT: s_lshr_b32 s15, s25, 16 -; SI-NEXT: s_lshr_b32 s14, s24, 16 -; SI-NEXT: s_lshr_b32 s13, s23, 16 -; SI-NEXT: s_lshr_b32 s12, s22, 16 -; SI-NEXT: s_lshr_b32 s11, s21, 16 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v16, s30, 0 +; SI-NEXT: v_writelane_b32 v16, s31, 1 +; SI-NEXT: v_writelane_b32 v16, s34, 2 +; SI-NEXT: v_writelane_b32 v16, s35, 3 +; SI-NEXT: v_writelane_b32 v16, s36, 4 +; SI-NEXT: v_writelane_b32 v16, s37, 5 +; SI-NEXT: v_writelane_b32 v16, s38, 6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: s_lshr_b32 s92, s29, 16 +; SI-NEXT: s_lshr_b32 s36, s28, 16 +; SI-NEXT: s_lshr_b32 s91, s27, 16 +; SI-NEXT: s_lshr_b32 s35, s26, 16 +; SI-NEXT: s_lshr_b32 s90, s25, 16 +; SI-NEXT: s_lshr_b32 s34, s24, 16 +; SI-NEXT: s_lshr_b32 s89, s23, 16 +; SI-NEXT: s_lshr_b32 s31, s22, 16 +; SI-NEXT: s_lshr_b32 s88, s21, 16 +; SI-NEXT: s_lshr_b32 s30, s20, 16 +; SI-NEXT: s_lshr_b32 s79, s19, 16 +; SI-NEXT: s_lshr_b32 s95, s18, 16 +; SI-NEXT: s_lshr_b32 s78, s17, 16 +; SI-NEXT: s_lshr_b32 s94, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; SI-NEXT: v_writelane_b32 v16, s39, 7 +; SI-NEXT: v_readfirstlane_b32 s37, v1 +; SI-NEXT: v_readfirstlane_b32 s38, v0 +; SI-NEXT: v_readfirstlane_b32 s93, v3 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; SI-NEXT: v_readfirstlane_b32 s39, v4 ; SI-NEXT: s_cbranch_scc0 .LBB89_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v2, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v34 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s78, 16 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s44, s94, 16 +; SI-NEXT: s_or_b32 s45, s5, s7 +; SI-NEXT: s_and_b32 s5, s19, 0xffff +; SI-NEXT: s_lshl_b32 s7, s79, 16 +; SI-NEXT: s_or_b32 s14, s4, s44 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s46, s95, 16 +; SI-NEXT: s_or_b32 s47, s5, s7 +; SI-NEXT: s_and_b32 s5, s21, 0xffff +; SI-NEXT: s_lshl_b32 s7, s88, 16 +; SI-NEXT: s_or_b32 s12, s4, s46 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s56, s30, 16 +; SI-NEXT: s_or_b32 s57, s5, s7 +; SI-NEXT: s_and_b32 s5, s23, 0xffff +; SI-NEXT: s_lshl_b32 s7, s89, 16 +; SI-NEXT: s_or_b32 s10, s4, s56 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s58, s31, 16 +; SI-NEXT: s_or_b32 s59, s5, s7 +; SI-NEXT: s_and_b32 s5, s25, 0xffff +; SI-NEXT: s_lshl_b32 s7, s90, 16 +; SI-NEXT: s_or_b32 s8, s4, s58 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s60, s34, 16 +; SI-NEXT: s_or_b32 s61, s5, s7 +; SI-NEXT: s_and_b32 s5, s27, 0xffff +; SI-NEXT: s_lshl_b32 s7, s91, 16 +; SI-NEXT: s_or_b32 s6, s4, s60 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s42, s35, 16 +; SI-NEXT: s_or_b32 s43, s5, s7 +; SI-NEXT: s_and_b32 s5, s29, 0xffff +; SI-NEXT: s_lshl_b32 s7, s92, 16 +; SI-NEXT: s_or_b32 s4, s4, s42 +; SI-NEXT: s_lshl_b32 s40, s36, 16 +; SI-NEXT: s_or_b32 s41, s5, s7 +; SI-NEXT: s_and_b32 s5, s37, 0xffff +; SI-NEXT: s_lshl_b32 s7, s93, 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[42:43], 16 +; SI-NEXT: s_and_b32 s42, s28, 0xffff +; SI-NEXT: s_or_b32 s75, s5, s7 +; SI-NEXT: s_lshl_b32 s74, s39, 16 +; SI-NEXT: s_or_b32 s42, s42, s40 +; SI-NEXT: s_lshr_b64 s[72:73], s[40:41], 16 +; SI-NEXT: s_and_b32 s40, s38, 0xffff +; SI-NEXT: s_mov_b32 s15, s45 +; SI-NEXT: s_lshr_b64 s[44:45], s[44:45], 16 +; SI-NEXT: s_mov_b32 s13, s47 +; SI-NEXT: s_lshr_b64 s[46:47], s[46:47], 16 +; SI-NEXT: s_mov_b32 s11, s57 +; SI-NEXT: s_lshr_b64 s[56:57], s[56:57], 16 +; SI-NEXT: s_mov_b32 s9, s59 +; SI-NEXT: s_lshr_b64 s[58:59], s[58:59], 16 +; SI-NEXT: s_mov_b32 s7, s61 +; SI-NEXT: s_lshr_b64 s[60:61], s[60:61], 16 +; SI-NEXT: s_mov_b32 s5, s43 +; SI-NEXT: s_mov_b32 s43, s41 +; SI-NEXT: s_or_b32 s40, s40, s74 +; SI-NEXT: s_mov_b32 s41, s75 +; SI-NEXT: s_lshr_b64 s[74:75], s[74:75], 16 ; SI-NEXT: s_cbranch_execnz .LBB89_3 ; SI-NEXT: .LBB89_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: s_add_i32 s43, s43, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s38, s38, 3 +; SI-NEXT: s_and_b32 s4, s38, 0xffff +; SI-NEXT: s_lshl_b32 s5, s39, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s37, s37, 3 +; SI-NEXT: s_add_i32 s40, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s37, 0xffff +; SI-NEXT: s_lshl_b32 s5, s93, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s41, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s36, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s42, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s92, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s43, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s35, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s27, 0xffff +; SI-NEXT: s_lshl_b32 s6, s91, 16 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xffff +; SI-NEXT: s_lshl_b32 s7, s34, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s25, 0xffff +; SI-NEXT: s_lshl_b32 s8, s90, 16 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s22, 0xffff +; SI-NEXT: s_lshl_b32 s9, s31, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s23, 0xffff +; SI-NEXT: s_lshl_b32 s10, s89, 16 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s20, 0xffff +; SI-NEXT: s_lshl_b32 s11, s30, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s21, 0xffff +; SI-NEXT: s_lshl_b32 s12, s88, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s12, s18, 0xffff +; SI-NEXT: s_lshl_b32 s13, s95, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s19, 0xffff +; SI-NEXT: s_lshl_b32 s14, s79, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_and_b32 s14, s16, 0xffff +; SI-NEXT: s_lshl_b32 s15, s94, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: s_and_b32 s15, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s78, 16 +; SI-NEXT: s_or_b32 s15, s16, s15 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_lshr_b64 s[44:45], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[40:41], 16 +; SI-NEXT: s_lshr_b32 s78, s15, 16 +; SI-NEXT: s_lshr_b32 s79, s13, 16 +; SI-NEXT: s_lshr_b32 s88, s11, 16 +; SI-NEXT: s_lshr_b32 s89, s9, 16 +; SI-NEXT: s_lshr_b32 s90, s7, 16 +; SI-NEXT: s_lshr_b32 s91, s5, 16 +; SI-NEXT: s_lshr_b32 s92, s43, 16 +; SI-NEXT: s_lshr_b32 s93, s41, 16 ; SI-NEXT: .LBB89_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 -; SI-NEXT: v_or_b32_e32 v1, v4, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v21 -; SI-NEXT: v_or_b32_e32 v5, v8, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v25 -; SI-NEXT: v_or_b32_e32 v7, v10, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v14 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v27 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v18 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v29 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v22 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v31 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v23 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s16, s44, 16 +; SI-NEXT: s_or_b32 s14, s14, s16 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s16, s78, 16 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s16, s46, 16 +; SI-NEXT: s_or_b32 s12, s12, s16 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s16, s79, 16 +; SI-NEXT: s_or_b32 s13, s13, s16 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s16, s56, 16 +; SI-NEXT: s_or_b32 s10, s10, s16 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s16, s88, 16 +; SI-NEXT: s_or_b32 s11, s11, s16 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s16, s58, 16 +; SI-NEXT: s_or_b32 s8, s8, s16 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s16, s89, 16 +; SI-NEXT: s_or_b32 s9, s9, s16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s16, s60, 16 +; SI-NEXT: s_or_b32 s6, s6, s16 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s16, s90, 16 +; SI-NEXT: s_or_b32 s7, s7, s16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s16, s62, 16 +; SI-NEXT: s_or_b32 s4, s4, s16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s16, s91, 16 +; SI-NEXT: s_or_b32 s5, s5, s16 +; SI-NEXT: s_and_b32 s16, s42, 0xffff +; SI-NEXT: s_lshl_b32 s17, s72, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s43, 0xffff +; SI-NEXT: s_lshl_b32 s18, s92, 16 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_and_b32 s18, s40, 0xffff +; SI-NEXT: s_lshl_b32 s19, s74, 16 +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: s_and_b32 s19, s41, 0xffff +; SI-NEXT: s_lshl_b32 s20, s93, 16 +; SI-NEXT: s_or_b32 s19, s19, s20 +; SI-NEXT: v_mov_b32_e32 v0, s14 +; SI-NEXT: v_mov_b32_e32 v1, s15 +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: v_mov_b32_e32 v3, s13 +; SI-NEXT: v_mov_b32_e32 v4, s10 +; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_mov_b32_e32 v6, s8 +; SI-NEXT: v_mov_b32_e32 v7, s9 +; SI-NEXT: v_mov_b32_e32 v8, s6 +; SI-NEXT: v_mov_b32_e32 v9, s7 +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: v_mov_b32_e32 v11, s5 +; SI-NEXT: v_mov_b32_e32 v12, s16 +; SI-NEXT: v_mov_b32_e32 v13, s17 +; SI-NEXT: v_mov_b32_e32 v14, s18 +; SI-NEXT: v_mov_b32_e32 v15, s19 +; SI-NEXT: v_readlane_b32 s39, v16, 7 +; SI-NEXT: v_readlane_b32 s38, v16, 6 +; SI-NEXT: v_readlane_b32 s37, v16, 5 +; SI-NEXT: v_readlane_b32 s36, v16, 4 +; SI-NEXT: v_readlane_b32 s35, v16, 3 +; SI-NEXT: v_readlane_b32 s34, v16, 2 +; SI-NEXT: v_readlane_b32 s31, v16, 1 +; SI-NEXT: v_readlane_b32 s30, v16, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB89_4: -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr74 ; SI-NEXT: s_branch .LBB89_2 ; ; VI-LABEL: bitcast_v32i16_to_v32f16_scalar: @@ -61710,276 +60762,212 @@ define <32 x i16> @bitcast_v32f16_to_v32i16(<32 x half> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB90_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_or_b32_e32 v15, v15, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_or_b32_e32 v13, v13, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_or_b32_e32 v11, v11, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_or_b32_e32 v9, v9, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_or_b32_e32 v7, v7, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_or_b32_e32 v7, v7, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_or_b32_e32 v5, v5, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 ; SI-NEXT: v_or_b32_e32 v3, v3, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v26 ; SI-NEXT: v_or_b32_e32 v1, v1, v32 ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v0, v0, v31 -; SI-NEXT: v_or_b32_e32 v24, v24, v30 -; SI-NEXT: v_or_b32_e32 v22, v22, v29 -; SI-NEXT: v_or_b32_e32 v21, v21, v28 -; SI-NEXT: v_or_b32_e32 v20, v20, v27 -; SI-NEXT: v_or_b32_e32 v18, v18, v26 -; SI-NEXT: v_or_b32_e32 v19, v19, v25 -; SI-NEXT: v_or_b32_e32 v17, v17, v23 +; SI-NEXT: v_or_b32_e32 v2, v2, v30 +; SI-NEXT: v_or_b32_e32 v4, v4, v29 +; SI-NEXT: v_or_b32_e32 v6, v6, v28 +; SI-NEXT: v_or_b32_e32 v8, v8, v27 +; SI-NEXT: v_or_b32_e32 v10, v10, v25 +; SI-NEXT: v_or_b32_e32 v12, v12, v22 +; SI-NEXT: v_or_b32_e32 v14, v14, v20 ; SI-NEXT: v_alignbit_b32 v31, v1, v31, 16 ; SI-NEXT: v_alignbit_b32 v30, v3, v30, 16 ; SI-NEXT: v_alignbit_b32 v29, v5, v29, 16 ; SI-NEXT: v_alignbit_b32 v28, v7, v28, 16 ; SI-NEXT: v_alignbit_b32 v27, v9, v27, 16 -; SI-NEXT: v_alignbit_b32 v26, v11, v26, 16 -; SI-NEXT: v_alignbit_b32 v25, v13, v25, 16 -; SI-NEXT: v_alignbit_b32 v23, v15, v23, 16 +; SI-NEXT: v_alignbit_b32 v25, v11, v25, 16 +; SI-NEXT: v_alignbit_b32 v22, v13, v22, 16 +; SI-NEXT: v_alignbit_b32 v20, v15, v20, 16 ; SI-NEXT: .LBB90_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v3, v3, v16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v29 +; SI-NEXT: v_or_b32_e32 v4, v4, v16 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v24 +; SI-NEXT: v_or_b32_e32 v5, v5, v16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v28 +; SI-NEXT: v_or_b32_e32 v6, v6, v16 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v23 +; SI-NEXT: v_or_b32_e32 v7, v7, v16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v27 +; SI-NEXT: v_or_b32_e32 v8, v8, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v21 +; SI-NEXT: v_or_b32_e32 v9, v9, v16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v25 +; SI-NEXT: v_or_b32_e32 v10, v10, v16 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v19 +; SI-NEXT: v_or_b32_e32 v11, v11, v16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v22 +; SI-NEXT: v_or_b32_e32 v12, v12, v16 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v18 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v13, v13, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v20 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v30 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v29 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v28 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 -; SI-NEXT: v_or_b32_e32 v10, v10, v18 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v26 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v30 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 ; SI-NEXT: v_or_b32_e32 v0, v0, v31 -; SI-NEXT: v_or_b32_e32 v2, v2, v24 -; SI-NEXT: v_or_b32_e32 v4, v4, v22 -; SI-NEXT: v_or_b32_e32 v6, v6, v21 -; SI-NEXT: v_or_b32_e32 v8, v8, v20 -; SI-NEXT: v_or_b32_e32 v12, v12, v18 -; SI-NEXT: v_or_b32_e32 v14, v14, v17 +; SI-NEXT: v_or_b32_e32 v2, v2, v26 ; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -62125,280 +61113,248 @@ define inreg <32 x i16> @bitcast_v32f16_to_v32i16_scalar(<32 x half> inreg %a, i ; SI-LABEL: bitcast_v32f16_to_v32i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s26 +; SI-NEXT: v_mov_b32_e32 v15, v0 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s10, s28, 16 +; SI-NEXT: s_lshr_b32 s40, s27, 16 +; SI-NEXT: s_lshr_b32 s11, s26, 16 +; SI-NEXT: s_lshr_b32 s14, s25, 16 +; SI-NEXT: s_lshr_b32 s13, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s15, s22, 16 +; SI-NEXT: s_lshr_b32 s9, s21, 16 +; SI-NEXT: s_lshr_b32 s41, s20, 16 +; SI-NEXT: s_lshr_b32 s8, s19, 16 +; SI-NEXT: s_lshr_b32 s42, s18, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: s_lshr_b32 s6, s24, 16 -; SI-NEXT: s_lshr_b32 s7, s22, 16 -; SI-NEXT: s_lshr_b32 s8, s20, 16 -; SI-NEXT: s_lshr_b32 s9, s18, 16 -; SI-NEXT: s_lshr_b32 s10, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v14 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cbranch_scc0 .LBB91_4 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; SI-NEXT: s_cbranch_scc0 .LBB91_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB91_3 +; SI-NEXT: s_cbranch_execnz .LBB91_4 ; SI-NEXT: .LBB91_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v14, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v51 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v34 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v52 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s14 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v6 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v28 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v34 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v55 -; SI-NEXT: v_or_b32_e32 v52, v16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v49 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v10 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s13 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s11 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v3 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s40 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s27 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v17 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v32 +; SI-NEXT: v_or_b32_e32 v11, v3, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s12 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 +; SI-NEXT: v_or_b32_e32 v13, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s9 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_or_b32_e32 v9, v3, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v7 +; SI-NEXT: v_or_b32_e32 v7, v17, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s7 +; SI-NEXT: v_or_b32_e32 v3, v18, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s17 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v19 +; SI-NEXT: v_or_b32_e32 v5, v17, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s6 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v19, v17, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s20 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v20 -; SI-NEXT: v_or_b32_e32 v55, v17, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v54 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 ; SI-NEXT: v_or_b32_e32 v53, v16, v4 -; SI-NEXT: v_or_b32_e32 v49, v18, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v36 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_or_b32_e32 v50, v17, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s26 +; SI-NEXT: v_or_b32_e32 v1, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s22 +; SI-NEXT: v_or_b32_e32 v51, v20, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v54, v17, v8 -; SI-NEXT: v_or_b32_e32 v50, v16, v10 -; SI-NEXT: v_or_b32_e32 v38, v18, v12 -; SI-NEXT: v_or_b32_e32 v36, v19, v14 -; SI-NEXT: v_lshr_b64 v[30:31], v[0:1], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[2:3], 16 -; SI-NEXT: v_lshr_b64 v[26:27], v[4:5], 16 -; SI-NEXT: v_lshr_b64 v[24:25], v[6:7], 16 -; SI-NEXT: v_lshr_b64 v[22:23], v[8:9], 16 -; SI-NEXT: v_lshr_b64 v[20:21], v[10:11], 16 -; SI-NEXT: v_lshr_b64 v[18:19], v[12:13], 16 -; SI-NEXT: v_lshr_b64 v[16:17], v[14:15], 16 -; SI-NEXT: .LBB91_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v52, v16, v12 +; SI-NEXT: v_or_b32_e32 v49, v17, v10 +; SI-NEXT: v_lshr_b64 v[16:17], v[4:5], 16 +; SI-NEXT: v_or_b32_e32 v48, v21, v8 +; SI-NEXT: v_or_b32_e32 v39, v20, v18 +; SI-NEXT: v_lshr_b64 v[30:31], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[20:21], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[0:1], 16 +; SI-NEXT: v_or_b32_e32 v15, v15, v0 +; SI-NEXT: s_branch .LBB91_5 +; SI-NEXT: .LBB91_3: +; SI-NEXT: s_branch .LBB91_2 +; SI-NEXT: .LBB91_4: +; SI-NEXT: v_mov_b32_e32 v38, s6 +; SI-NEXT: v_mov_b32_e32 v14, s40 +; SI-NEXT: v_mov_b32_e32 v33, s14 +; SI-NEXT: v_mov_b32_e32 v34, s12 +; SI-NEXT: v_mov_b32_e32 v35, s9 +; SI-NEXT: v_mov_b32_e32 v36, s8 +; SI-NEXT: v_mov_b32_e32 v37, s7 +; SI-NEXT: v_mov_b32_e32 v5, s17 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v7, s21 +; SI-NEXT: v_mov_b32_e32 v39, s28 +; SI-NEXT: v_mov_b32_e32 v49, s26 +; SI-NEXT: v_mov_b32_e32 v52, s24 +; SI-NEXT: v_mov_b32_e32 v48, s22 +; SI-NEXT: v_mov_b32_e32 v50, s20 +; SI-NEXT: v_mov_b32_e32 v51, s18 +; SI-NEXT: v_mov_b32_e32 v53, s16 +; SI-NEXT: v_mov_b32_e32 v9, s23 +; SI-NEXT: v_mov_b32_e32 v13, s25 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v19, s29 +; SI-NEXT: v_mov_b32_e32 v16, s43 +; SI-NEXT: v_mov_b32_e32 v30, s42 +; SI-NEXT: v_mov_b32_e32 v28, s41 +; SI-NEXT: v_mov_b32_e32 v26, s15 +; SI-NEXT: v_mov_b32_e32 v24, s13 +; SI-NEXT: v_mov_b32_e32 v22, s11 +; SI-NEXT: v_mov_b32_e32 v20, s10 +; SI-NEXT: .LBB91_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v37 +; SI-NEXT: v_or_b32_e32 v16, v2, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v26 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v53 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v28 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v24 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v26 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v34 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v54 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v22 -; SI-NEXT: v_or_b32_e32 v8, v8, v10 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v24 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v33 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v50 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v20 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v22 ; SI-NEXT: v_or_b32_e32 v10, v10, v12 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v14 ; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v38 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v18 -; SI-NEXT: v_or_b32_e32 v12, v12, v14 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v20 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v36 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v14, v14, v16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v32 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 +; SI-NEXT: v_or_b32_e32 v15, v1, v15 +; SI-NEXT: v_mov_b32_e32 v1, v16 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB91_4: -; SI-NEXT: s_branch .LBB91_2 ; ; VI-LABEL: bitcast_v32f16_to_v32i16_scalar: ; VI: ; %bb.0: @@ -75409,54 +74365,6 @@ define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v32f16_to_v32bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -75473,131 +74381,109 @@ define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v32, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v12 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v57, v20 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v13 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v59, v19 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v14 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v61, v18 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v62, v15 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v17 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB100_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v62 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v63 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB100_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v63 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr43 @@ -75617,215 +74503,215 @@ define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB100_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v0 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v12 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v54 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v20 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v24 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v46 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v45 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v44 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v43 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v42 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v28 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v15 ; SI-NEXT: .LBB100_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v17 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v23 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v18 ; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v25 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v20 ; SI-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v27 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v22 ; SI-NEXT: v_alignbit_b32 v4, v4, v5, 16 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v29 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v24 ; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v31 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v26 ; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v33 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v28 ; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v35 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v30 ; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v37 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v32 ; SI-NEXT: v_alignbit_b32 v9, v9, v10, 16 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v39 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v34 ; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v49 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v36 ; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v51 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v38 ; SI-NEXT: v_alignbit_b32 v12, v12, v13, 16 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v53 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v48 ; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v55 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v50 ; SI-NEXT: v_alignbit_b32 v14, v14, v15, 16 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v31 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v40 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v52 ; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -75972,392 +74858,318 @@ define inreg <32 x bfloat> @bitcast_v32f16_to_v32bf16_scalar(<32 x half> inreg % ; SI-LABEL: bitcast_v32f16_to_v32bf16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 -; SI-NEXT: s_lshr_b32 s10, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s10 -; SI-NEXT: s_lshr_b32 s10, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s10 +; SI-NEXT: s_lshr_b32 s43, s29, 16 +; SI-NEXT: s_lshr_b32 s42, s28, 16 +; SI-NEXT: s_lshr_b32 s41, s27, 16 +; SI-NEXT: s_lshr_b32 s40, s26, 16 +; SI-NEXT: s_lshr_b32 s15, s25, 16 +; SI-NEXT: s_lshr_b32 s14, s24, 16 +; SI-NEXT: s_lshr_b32 s13, s23, 16 +; SI-NEXT: s_lshr_b32 s12, s22, 16 +; SI-NEXT: s_lshr_b32 s11, s21, 16 ; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_lshr_b32 s10, s19, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s10 -; SI-NEXT: s_lshr_b32 s10, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 -; SI-NEXT: s_lshr_b32 s10, s17, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 -; SI-NEXT: s_lshr_b32 s10, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v5 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v57, v6 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v59, v9 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v0 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v61, v4 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v62, v14 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v3 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s18, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cbranch_scc0 .LBB101_4 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; SI-NEXT: s_cbranch_scc0 .LBB101_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: s_lshl_b32 s44, s16, 16 +; SI-NEXT: s_lshl_b32 s45, s6, 16 +; SI-NEXT: s_lshl_b32 s46, s17, 16 +; SI-NEXT: s_lshl_b32 s47, s7, 16 +; SI-NEXT: s_lshl_b32 s56, s18, 16 +; SI-NEXT: s_lshl_b32 s57, s8, 16 +; SI-NEXT: s_lshl_b32 s58, s19, 16 +; SI-NEXT: s_lshl_b32 s59, s9, 16 +; SI-NEXT: s_lshl_b32 s60, s20, 16 +; SI-NEXT: s_lshl_b32 s61, s10, 16 +; SI-NEXT: s_lshl_b32 s62, s21, 16 +; SI-NEXT: s_lshl_b32 s63, s11, 16 +; SI-NEXT: s_lshl_b32 s72, s22, 16 +; SI-NEXT: s_lshl_b32 s73, s12, 16 +; SI-NEXT: s_lshl_b32 s74, s23, 16 +; SI-NEXT: s_lshl_b32 s75, s13, 16 +; SI-NEXT: s_lshl_b32 s76, s24, 16 +; SI-NEXT: s_lshl_b32 s77, s14, 16 +; SI-NEXT: s_lshl_b32 s78, s25, 16 +; SI-NEXT: s_lshl_b32 s79, s15, 16 +; SI-NEXT: s_lshl_b32 s88, s26, 16 +; SI-NEXT: s_lshl_b32 s89, s40, 16 +; SI-NEXT: s_lshl_b32 s90, s27, 16 +; SI-NEXT: s_lshl_b32 s91, s41, 16 +; SI-NEXT: s_lshl_b32 s92, s28, 16 +; SI-NEXT: s_lshl_b32 s93, s42, 16 +; SI-NEXT: s_lshl_b32 s94, s29, 16 +; SI-NEXT: s_lshl_b32 s95, s43, 16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v62 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v63 -; SI-NEXT: s_cbranch_execnz .LBB101_3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: s_cbranch_execnz .LBB101_4 ; SI-NEXT: .LBB101_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v60 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v56 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v44 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s26 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s43 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s41 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s15 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s16 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v0 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v9 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v15 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v19 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v25 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v31 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v38 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v13 -; SI-NEXT: .LBB101_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6 -; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_lshr_b64 v[3:4], v[4:5], 16 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v16 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v11 -; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v9 -; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v21 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v17 -; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v15 -; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v26 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v22 -; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v24 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v19 -; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v32 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v27 -; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v30 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v25 -; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v36 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v33 -; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v34 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v31 -; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v51 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v39 -; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v49 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v38 -; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB101_4: -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_branch .LBB101_5 +; SI-NEXT: .LBB101_3: +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: s_branch .LBB101_2 +; SI-NEXT: .LBB101_4: +; SI-NEXT: v_mov_b32_e32 v1, s95 +; SI-NEXT: v_mov_b32_e32 v0, s94 +; SI-NEXT: v_mov_b32_e32 v7, s93 +; SI-NEXT: v_mov_b32_e32 v6, s92 +; SI-NEXT: v_mov_b32_e32 v9, s91 +; SI-NEXT: v_mov_b32_e32 v8, s90 +; SI-NEXT: v_mov_b32_e32 v11, s89 +; SI-NEXT: v_mov_b32_e32 v10, s88 +; SI-NEXT: v_mov_b32_e32 v13, s79 +; SI-NEXT: v_mov_b32_e32 v12, s78 +; SI-NEXT: v_mov_b32_e32 v15, s77 +; SI-NEXT: v_mov_b32_e32 v14, s76 +; SI-NEXT: v_mov_b32_e32 v17, s75 +; SI-NEXT: v_mov_b32_e32 v16, s74 +; SI-NEXT: v_mov_b32_e32 v19, s73 +; SI-NEXT: v_mov_b32_e32 v18, s72 +; SI-NEXT: v_mov_b32_e32 v21, s63 +; SI-NEXT: v_mov_b32_e32 v20, s62 +; SI-NEXT: v_mov_b32_e32 v23, s61 +; SI-NEXT: v_mov_b32_e32 v22, s60 +; SI-NEXT: v_mov_b32_e32 v25, s59 +; SI-NEXT: v_mov_b32_e32 v24, s58 +; SI-NEXT: v_mov_b32_e32 v27, s57 +; SI-NEXT: v_mov_b32_e32 v26, s56 +; SI-NEXT: v_mov_b32_e32 v29, s47 +; SI-NEXT: v_mov_b32_e32 v28, s46 +; SI-NEXT: v_mov_b32_e32 v31, s45 +; SI-NEXT: v_mov_b32_e32 v30, s44 +; SI-NEXT: .LBB101_5: ; %end +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; SI-NEXT: v_lshr_b64 v[0:1], v[30:31], 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v2 +; SI-NEXT: v_lshr_b64 v[1:2], v[28:29], 16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v3 +; SI-NEXT: v_lshr_b64 v[2:3], v[26:27], 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_lshr_b64 v[3:4], v[24:25], 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_lshr_b64 v[4:5], v[22:23], 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v6 +; SI-NEXT: v_lshr_b64 v[5:6], v[20:21], 16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; SI-NEXT: v_lshr_b64 v[6:7], v[18:19], 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v8 +; SI-NEXT: v_lshr_b64 v[7:8], v[16:17], 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshr_b64 v[8:9], v[14:15], 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v10 +; SI-NEXT: v_lshr_b64 v[9:10], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[10:11], v[32:33], 16 +; SI-NEXT: v_lshr_b64 v[11:12], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[12:13], v[36:37], 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[38:39], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[48:49], 16 +; SI-NEXT: v_lshr_b64 v[15:16], v[50:51], 16 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32f16_to_v32bf16_scalar: ; VI: ; %bb.0: @@ -76581,7 +75393,7 @@ define <32 x half> @bitcast_v32bf16_to_v32f16(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v0 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill @@ -76600,309 +75412,271 @@ define <32 x half> @bitcast_v32bf16_to_v32f16(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v31 -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v30 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v9 ; SI-NEXT: v_mul_f32_e32 v41, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v12 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v20 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v13 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v19 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v14 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v18 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mul_f32_e32 v62, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v18 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v63, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v14 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB102_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v43 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v45 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v47 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v57 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v59 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v61 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v62 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v31 +; SI-NEXT: v_alignbit_b32 v23, v1, v33, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v47 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_alignbit_b32 v24, v23, v0, 16 +; SI-NEXT: v_alignbit_b32 v22, v3, v35, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v46 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v36 +; SI-NEXT: v_alignbit_b32 v25, v22, v0, 16 +; SI-NEXT: v_alignbit_b32 v21, v5, v37, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v44 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v38 +; SI-NEXT: v_alignbit_b32 v26, v21, v0, 16 +; SI-NEXT: v_alignbit_b32 v20, v7, v39, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v42 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v48 +; SI-NEXT: v_alignbit_b32 v27, v20, v0, 16 +; SI-NEXT: v_alignbit_b32 v19, v9, v49, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v41 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v50 +; SI-NEXT: v_alignbit_b32 v28, v19, v0, 16 +; SI-NEXT: v_alignbit_b32 v17, v11, v51, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v40 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v52 +; SI-NEXT: v_alignbit_b32 v29, v17, v0, 16 +; SI-NEXT: v_alignbit_b32 v14, v13, v54, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v55 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v43 +; SI-NEXT: v_alignbit_b32 v30, v14, v0, 16 +; SI-NEXT: v_alignbit_b32 v16, v15, v45, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v53 +; SI-NEXT: v_alignbit_b32 v32, v16, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v53 +; SI-NEXT: v_alignbit_b32 v0, v0, v63, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v62, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v61, 16 +; SI-NEXT: v_alignbit_b32 v6, v6, v60, 16 +; SI-NEXT: v_alignbit_b32 v8, v8, v59, 16 +; SI-NEXT: v_alignbit_b32 v10, v10, v58, 16 +; SI-NEXT: v_alignbit_b32 v12, v12, v57, 16 +; SI-NEXT: v_alignbit_b32 v18, v18, v56, 16 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: .LBB102_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB102_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v62 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v61 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v60 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v58 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v56 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v47 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v46 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v45 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v44 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v43 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v42 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v41 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v40 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v55 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v54 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v52 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v51 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v50 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v49 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v48 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v38 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v37 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v36 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v34 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v33 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v32 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v46 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v44 +; SI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v42 +; SI-NEXT: v_alignbit_b32 v4, v4, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v41 +; SI-NEXT: v_alignbit_b32 v6, v6, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v40 +; SI-NEXT: v_alignbit_b32 v8, v8, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v58 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v55 +; SI-NEXT: v_alignbit_b32 v10, v10, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v53 +; SI-NEXT: v_alignbit_b32 v12, v12, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 +; SI-NEXT: v_alignbit_b32 v18, v14, v1, 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v52 +; SI-NEXT: v_alignbit_b32 v16, v15, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v50 +; SI-NEXT: v_alignbit_b32 v14, v13, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v48 +; SI-NEXT: v_alignbit_b32 v17, v11, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v38 +; SI-NEXT: v_alignbit_b32 v19, v9, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v36 +; SI-NEXT: v_alignbit_b32 v20, v7, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v34 +; SI-NEXT: v_alignbit_b32 v21, v5, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v22, v3, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v23, v1, v23, 16 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_alignbit_b32 v24, v23, v24, 16 +; SI-NEXT: v_alignbit_b32 v25, v22, v25, 16 +; SI-NEXT: v_alignbit_b32 v26, v21, v26, 16 +; SI-NEXT: v_alignbit_b32 v27, v20, v27, 16 +; SI-NEXT: v_alignbit_b32 v28, v19, v28, 16 +; SI-NEXT: v_alignbit_b32 v29, v17, v29, 16 +; SI-NEXT: v_alignbit_b32 v30, v14, v30, 16 +; SI-NEXT: v_alignbit_b32 v32, v16, v32, 16 ; SI-NEXT: .LBB102_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload @@ -76921,70 +75695,54 @@ define <32 x half> @bitcast_v32bf16_to_v32f16(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v0, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v8 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v19 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v16 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v23 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v20 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v27 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v24 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v30 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v28 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v11, v17, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v30 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25 +; SI-NEXT: v_or_b32_e32 v3, v22, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v26 +; SI-NEXT: v_or_b32_e32 v5, v21, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27 +; SI-NEXT: v_or_b32_e32 v7, v20, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; SI-NEXT: v_or_b32_e32 v9, v19, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 +; SI-NEXT: v_or_b32_e32 v12, v12, v17 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v32 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v0, v0, v24 +; SI-NEXT: v_or_b32_e32 v2, v2, v23 +; SI-NEXT: v_or_b32_e32 v4, v4, v22 +; SI-NEXT: v_or_b32_e32 v6, v6, v21 +; SI-NEXT: v_or_b32_e32 v8, v8, v20 +; SI-NEXT: v_or_b32_e32 v10, v10, v19 +; SI-NEXT: v_or_b32_e32 v14, v14, v17 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -78118,10 +76876,11 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg % ; SI-LABEL: bitcast_v32bf16_to_v32f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v1 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v0 +; SI-NEXT: s_and_b32 s42, s17, 0xffff0000 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_and_b32 s6, s29, 0xffff0000 ; SI-NEXT: s_lshl_b32 s7, s29, 16 ; SI-NEXT: s_and_b32 s8, s28, 0xffff0000 @@ -78146,11 +76905,11 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg % ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_and_b32 s41, s18, 0xffff0000 ; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_and_b32 s42, s17, 0xffff0000 ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_and_b32 s43, s16, 0xffff0000 ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s42 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -78168,295 +76927,431 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg % ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s43 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s42 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s41 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s40 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28 -; SI-NEXT: v_mul_f32_e64 v13, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v17, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v22, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v26, 1.0, s26 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v15 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v14 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v63, 1.0, v12 -; SI-NEXT: v_mul_f32_e64 v51, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v41, 1.0, s15 -; SI-NEXT: v_mul_f32_e64 v43, 1.0, s14 -; SI-NEXT: v_mul_f32_e64 v44, 1.0, s13 -; SI-NEXT: v_mul_f32_e64 v45, 1.0, s12 -; SI-NEXT: v_mul_f32_e64 v46, 1.0, s11 -; SI-NEXT: v_mul_f32_e64 v47, 1.0, s10 -; SI-NEXT: v_mul_f32_e64 v56, 1.0, s9 -; SI-NEXT: v_mul_f32_e64 v58, 1.0, s8 -; SI-NEXT: v_mul_f32_e64 v60, 1.0, s7 -; SI-NEXT: v_mul_f32_e64 v62, 1.0, s6 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s43 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s41 +; SI-NEXT: v_mul_f32_e64 v51, 1.0, s40 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v53, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s14 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s12 +; SI-NEXT: v_mul_f32_e64 v42, 1.0, s10 +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s6 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v0 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s15 +; SI-NEXT: v_mul_f32_e64 v43, 1.0, s11 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s7 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v59, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v56, 1.0, s13 +; SI-NEXT: v_mul_f32_e64 v55, 1.0, s9 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB103_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v43 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v45 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v47 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v58 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v62 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v59 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v34 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v36 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v38 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v48 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v50 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v53 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v61 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v51 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[1:2], v[38:39], 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v42 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v9 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v54 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v11 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v19 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; SI-NEXT: v_lshr_b64 v[34:35], v[48:49], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; SI-NEXT: v_lshr_b64 v[2:3], v[33:34], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[5:6], v[43:44], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[0:1], 16 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v2, v5 +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[44:45], v[28:29], 16 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v63 +; SI-NEXT: v_lshr_b64 v[21:22], v[12:13], 16 +; SI-NEXT: v_mov_b32_e32 v22, v42 +; SI-NEXT: v_lshr_b64 v[41:42], v[46:47], 16 +; SI-NEXT: v_mov_b32_e32 v28, v60 +; SI-NEXT: v_lshr_b64 v[31:32], v[26:27], 16 +; SI-NEXT: v_mov_b32_e32 v45, v40 +; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v16 +; SI-NEXT: v_mov_b32_e32 v29, v61 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v7 +; SI-NEXT: v_mov_b32_e32 v26, v59 +; SI-NEXT: v_mov_b32_e32 v6, v51 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_mov_b32_e32 v27, v60 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v18 +; SI-NEXT: v_mov_b32_e32 v12, v9 +; SI-NEXT: v_mov_b32_e32 v8, v58 +; SI-NEXT: v_mov_b32_e32 v9, v59 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v53 +; SI-NEXT: v_mov_b32_e32 v60, v58 +; SI-NEXT: v_mov_b32_e32 v59, v57 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v19 +; SI-NEXT: v_mov_b32_e32 v58, v57 +; SI-NEXT: v_mov_b32_e32 v57, v56 +; SI-NEXT: v_mov_b32_e32 v32, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v53 +; SI-NEXT: v_mov_b32_e32 v62, v47 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v63 +; SI-NEXT: v_mov_b32_e32 v61, v46 +; SI-NEXT: v_mov_b32_e32 v47, v11 +; SI-NEXT: v_mov_b32_e32 v0, v16 +; SI-NEXT: v_lshr_b64 v[10:11], v[30:31], 16 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_mov_b32_e32 v42, v22 +; SI-NEXT: v_mov_b32_e32 v30, v55 +; SI-NEXT: v_mov_b32_e32 v11, v47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v2, v44 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[2:3], v[43:44], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[2:3], v[40:41], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[2:3], v[26:27], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[24:25], v[36:37], 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[2:3], v[8:9], 16 +; SI-NEXT: v_mov_b32_e32 v9, v12 +; SI-NEXT: v_lshr_b64 v[12:13], v[57:58], 16 +; SI-NEXT: v_mov_b32_e32 v25, v18 +; SI-NEXT: v_mov_b32_e32 v18, v53 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; SI-NEXT: v_lshr_b64 v[15:16], v[23:24], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[20:21], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[22:23], v[55:56], 16 +; SI-NEXT: v_lshr_b64 v[55:56], v[52:53], 16 +; SI-NEXT: v_mov_b32_e32 v40, v45 +; SI-NEXT: v_lshr_b64 v[45:46], v[50:51], 16 +; SI-NEXT: v_mov_b32_e32 v51, v6 +; SI-NEXT: v_lshr_b64 v[5:6], v[28:29], 16 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshr_b64 v[2:3], v[59:60], 16 +; SI-NEXT: v_mov_b32_e32 v56, v57 +; SI-NEXT: v_mov_b32_e32 v57, v59 +; SI-NEXT: v_mov_b32_e32 v59, v26 +; SI-NEXT: v_mov_b32_e32 v60, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v46, v61 +; SI-NEXT: v_mov_b32_e32 v16, v0 +; SI-NEXT: v_mov_b32_e32 v20, v32 +; SI-NEXT: v_mov_b32_e32 v32, v41 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v23, v52 +; SI-NEXT: v_mov_b32_e32 v47, v62 +; SI-NEXT: v_mov_b32_e32 v61, v4 +; SI-NEXT: v_mov_b32_e32 v41, v31 +; SI-NEXT: v_mov_b32_e32 v31, v10 +; SI-NEXT: v_mov_b32_e32 v22, v15 +; SI-NEXT: v_mov_b32_e32 v58, v8 +; SI-NEXT: v_mov_b32_e32 v53, v18 +; SI-NEXT: v_mov_b32_e32 v18, v25 +; SI-NEXT: v_mov_b32_e32 v52, v2 ; SI-NEXT: s_cbranch_execnz .LBB103_3 ; SI-NEXT: .LBB103_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v63 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v61 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v62 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v60 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v58 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v56 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v47 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v46 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v44 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v43 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v41 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v54 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v51 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v22 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v13 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v50 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v60 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshr_b64 v[60:61], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v58 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_lshr_b64 v[52:53], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_lshr_b64 v[55:56], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v46, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v43, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v3 +; SI-NEXT: v_lshr_b64 v[10:11], v[43:44], 16 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshr_b64 v[57:58], v[46:47], 16 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[17:18], v[28:29], 16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v8 +; SI-NEXT: v_lshr_b64 v[21:22], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[41:42], v[26:27], 16 +; SI-NEXT: v_and_b32_e32 v56, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v4 +; SI-NEXT: v_lshr_b64 v[31:32], v[40:41], 16 +; SI-NEXT: v_lshr_b64 v[61:62], v[9:10], 16 +; SI-NEXT: v_mov_b32_e32 v5, v60 +; SI-NEXT: v_mov_b32_e32 v32, v57 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[38:39], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v48, 0x40c00000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[14:15], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[36:37], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[23:24], 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; SI-NEXT: v_lshr_b64 v[34:35], v[48:49], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[33:34], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v2, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[2:3], v[16:17], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[2:3], v[56:57], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: .LBB103_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v14 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v37 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v41 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v27 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v23 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v31 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v28 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v35 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v32 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v39 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v36 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v52 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v55 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v42 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v40 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v39 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v31 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v61 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v44 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v29 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v55 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v47 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -78473,49 +77368,57 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg % ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v15, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v53 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v50 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB103_4: -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: v_mov_b32_e32 v30, v55 +; SI-NEXT: v_mov_b32_e32 v23, v52 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_branch .LBB103_2 ; ; VI-LABEL: bitcast_v32bf16_to_v32f16_scalar: @@ -79886,97 +78789,49 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v32f16_to_v64i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v31 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -79993,690 +78848,691 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v26, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v16 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; SI-NEXT: ; kill: killed $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB104_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v26 -; SI-NEXT: v_or_b32_e32 v54, v25, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v8 -; SI-NEXT: v_or_b32_e32 v50, v23, v9 -; SI-NEXT: v_alignbit_b32 v9, v50, v54, 16 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v9, v50, v54, 8 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v58 +; SI-NEXT: v_or_b32_e32 v57, v1, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_or_b32_e32 v45, v1, v2 +; SI-NEXT: v_alignbit_b32 v1, v45, v57, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v29 -; SI-NEXT: v_or_b32_e32 v22, v28, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_or_b32_e32 v21, v27, v9 -; SI-NEXT: v_alignbit_b32 v9, v21, v22, 24 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v45, v57, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v9, v21, v22, 16 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v45, v57, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v9, v21, v22, 8 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 +; SI-NEXT: v_or_b32_e32 v38, v1, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_or_b32_e32 v37, v1, v2 +; SI-NEXT: v_alignbit_b32 v1, v37, v38, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v32 -; SI-NEXT: v_or_b32_e32 v19, v31, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v6 -; SI-NEXT: v_or_b32_e32 v20, v30, v9 -; SI-NEXT: v_alignbit_b32 v9, v20, v19, 24 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v37, v38, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v9, v20, v19, 16 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v37, v38, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v9, v20, v19, 8 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 +; SI-NEXT: v_or_b32_e32 v35, v1, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_or_b32_e32 v36, v1, v2 +; SI-NEXT: v_alignbit_b32 v1, v36, v35, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v35 -; SI-NEXT: v_or_b32_e32 v17, v34, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v5 -; SI-NEXT: v_or_b32_e32 v18, v33, v9 -; SI-NEXT: v_alignbit_b32 v9, v18, v17, 24 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v9, v18, v17, 8 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v38 -; SI-NEXT: v_or_b32_e32 v16, v37, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v4 -; SI-NEXT: v_or_b32_e32 v15, v36, v9 -; SI-NEXT: v_alignbit_b32 v9, v15, v16, 24 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v9, v15, v16, 16 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v9, v15, v16, 8 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v49 -; SI-NEXT: v_or_b32_e32 v13, v48, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; SI-NEXT: v_or_b32_e32 v14, v39, v9 -; SI-NEXT: v_alignbit_b32 v9, v14, v13, 24 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v9, v14, v13, 16 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v9, v14, v13, 8 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v53 -; SI-NEXT: v_or_b32_e32 v11, v52, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; SI-NEXT: v_or_b32_e32 v12, v51, v9 -; SI-NEXT: v_alignbit_b32 v9, v12, v11, 24 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v9, v12, v11, 16 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v9, v12, v11, 8 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v41 -; SI-NEXT: v_or_b32_e32 v10, v40, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_or_b32_e32 v9, v55, v9 -; SI-NEXT: v_alignbit_b32 v23, v9, v10, 24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v9, v10, 16 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v9, v10, 8 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v23, 8, v12 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v23, 8, v9 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v23, v1, 8, 8 -; SI-NEXT: v_alignbit_b32 v43, v50, v54, 24 -; SI-NEXT: v_alignbit_b32 v42, v18, v17, 16 -; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v50 -; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v21 -; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v20 -; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v18 -; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v15 -; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v14 -; SI-NEXT: v_bfe_u32 v24, v8, 8, 8 -; SI-NEXT: v_bfe_u32 v62, v7, 8, 8 -; SI-NEXT: v_bfe_u32 v61, v6, 8, 8 -; SI-NEXT: v_bfe_u32 v59, v5, 8, 8 -; SI-NEXT: v_bfe_u32 v57, v4, 8, 8 -; SI-NEXT: v_bfe_u32 v47, v3, 8, 8 -; SI-NEXT: v_bfe_u32 v45, v2, 8, 8 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_alignbit_b32 v1, v36, v35, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v36, v35, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v61 +; SI-NEXT: v_or_b32_e32 v33, v1, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_or_b32_e32 v34, v1, v2 +; SI-NEXT: v_alignbit_b32 v1, v34, v33, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v34, v33, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v34, v33, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v62 +; SI-NEXT: v_or_b32_e32 v31, v1, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_or_b32_e32 v32, v1, v2 +; SI-NEXT: v_alignbit_b32 v1, v32, v31, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v32, v31, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v32, v31, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v29, v1, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_or_b32_e32 v30, v1, v2 +; SI-NEXT: v_alignbit_b32 v1, v30, v29, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v30, v29, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v30, v29, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 +; SI-NEXT: v_or_b32_e32 v27, v1, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_or_b32_e32 v28, v1, v2 +; SI-NEXT: v_alignbit_b32 v1, v28, v27, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v28, v27, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v28, v27, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v17, v1, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_or_b32_e32 v25, v1, v2 +; SI-NEXT: v_alignbit_b32 v1, v25, v17, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v25, v17, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v25, v17, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v25 +; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v45 +; SI-NEXT: v_lshrrev_b32_e32 v43, 8, v37 +; SI-NEXT: v_lshrrev_b32_e32 v41, 8, v36 +; SI-NEXT: v_lshrrev_b32_e32 v55, 8, v34 +; SI-NEXT: v_lshrrev_b32_e32 v53, 8, v32 +; SI-NEXT: v_lshrrev_b32_e32 v51, 8, v30 +; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v56, v26, 8, 8 +; SI-NEXT: v_bfe_u32 v46, v24, 8, 8 +; SI-NEXT: v_bfe_u32 v44, v23, 8, 8 +; SI-NEXT: v_bfe_u32 v42, v22, 8, 8 +; SI-NEXT: v_bfe_u32 v40, v21, 8, 8 +; SI-NEXT: v_bfe_u32 v54, v20, 8, 8 +; SI-NEXT: v_bfe_u32 v52, v19, 8, 8 +; SI-NEXT: v_bfe_u32 v50, v18, 8, 8 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: .LBB104_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB104_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v55 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v10, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v53 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v49 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v17, v15, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v63 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_or_b32_e32 v25, v16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_or_b32_e32 v27, v13, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v16 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v37 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v38 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v14, v14, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v36 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v16, v16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v35 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v4 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v32 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v31 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v30 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v28 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v29 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v6 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v28, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v20 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_or_b32_e32 v29, v11, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v20 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_or_b32_e32 v30, v11, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v20, v20, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v27 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_or_b32_e32 v31, v9, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v12 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v32, v10, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v22, v22, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v7 -; SI-NEXT: v_or_b32_e32 v21, v24, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_or_b32_e32 v33, v7, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v10 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v54, v24, v25 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v8 -; SI-NEXT: v_or_b32_e32 v50, v23, v24 -; SI-NEXT: v_alignbit_b32 v23, v50, v54, 16 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v34, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v23 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_or_b32_e32 v35, v5, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v36, v5, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v6 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v38, v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v24 +; SI-NEXT: v_or_b32_e32 v37, v4, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v57, v1, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; SI-NEXT: v_or_b32_e32 v45, v2, v1 +; SI-NEXT: v_alignbit_b32 v1, v45, v57, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v50, v54, 8 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v45, v57, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v45, v57, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v21, v22, 24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v37, v38, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v21, v22, 16 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v37, v38, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v21, v22, 8 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v37, v38, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v20, v19, 24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v36, v35, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v20, v19, 16 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v36, v35, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v20, v19, 8 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v36, v35, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v18, v17, 24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v34, v33, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v18, v17, 8 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v34, v33, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v15, v16, 24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v34, v33, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v15, v16, 16 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v32, v31, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v15, v16, 8 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v32, v31, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v14, v13, 24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v32, v31, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v14, v13, 16 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v30, v29, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v14, v13, 8 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v30, v29, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v12, v11, 24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v30, v29, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v12, v11, 16 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v28, v27, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v12, v11, 8 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v28, v27, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v9, v10, 24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v28, v27, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v9, v10, 16 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v25, v17, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v9, v10, 8 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v25, v17, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v23, 8, v12 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v25, v17, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v45 +; SI-NEXT: v_lshrrev_b32_e32 v43, 8, v37 +; SI-NEXT: v_lshrrev_b32_e32 v41, 8, v36 +; SI-NEXT: v_lshrrev_b32_e32 v55, 8, v34 +; SI-NEXT: v_lshrrev_b32_e32 v53, 8, v32 +; SI-NEXT: v_lshrrev_b32_e32 v51, 8, v30 +; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v28 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v23, 8, v9 -; SI-NEXT: v_alignbit_b32 v43, v50, v54, 24 -; SI-NEXT: v_alignbit_b32 v42, v18, v17, 16 -; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v50 -; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v21 -; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v20 -; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v18 -; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v15 -; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v14 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: v_bfe_u32 v24, v8, 8, 8 -; SI-NEXT: v_bfe_u32 v62, v7, 8, 8 -; SI-NEXT: v_bfe_u32 v61, v6, 8, 8 -; SI-NEXT: v_bfe_u32 v59, v5, 8, 8 -; SI-NEXT: v_bfe_u32 v57, v4, 8, 8 -; SI-NEXT: v_bfe_u32 v47, v3, 8, 8 -; SI-NEXT: v_bfe_u32 v45, v2, 8, 8 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v23, v1, 8, 8 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v25 +; SI-NEXT: v_bfe_u32 v56, v26, 8, 8 +; SI-NEXT: v_bfe_u32 v46, v24, 8, 8 +; SI-NEXT: v_bfe_u32 v44, v23, 8, 8 +; SI-NEXT: v_bfe_u32 v42, v22, 8, 8 +; SI-NEXT: v_bfe_u32 v40, v21, 8, 8 +; SI-NEXT: v_bfe_u32 v54, v20, 8, 8 +; SI-NEXT: v_bfe_u32 v52, v19, 8, 8 +; SI-NEXT: v_bfe_u32 v50, v18, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: .LBB104_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xff, v54 -; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v43 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v24 -; SI-NEXT: v_or_b32_e32 v8, v24, v8 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v57 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 -; SI-NEXT: v_or_b32_e32 v23, v23, v25 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: v_or_b32_e32 v23, v23, v25 -; SI-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xff, v50 -; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v63 -; SI-NEXT: v_or_b32_e32 v23, v23, v25 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v8, v23, v8 -; SI-NEXT: v_add_i32_e32 v23, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v8, v23, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v56 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 -; SI-NEXT: v_or_b32_e32 v8, v8, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_or_b32_e32 v8, v8, v22 -; SI-NEXT: v_add_i32_e32 v22, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v8, v22, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v60 -; SI-NEXT: v_or_b32_e32 v8, v8, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v62 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_or_b32_e32 v7, v21, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v46 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v19, v8 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v58 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v61 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_add_i32_e32 v7, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v44 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v42 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v56 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v59 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v42 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v16 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v46 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v57 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_add_i32_e32 v5, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v40 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v13 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v44 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v47 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v11 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v27 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v45 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v17 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v50 ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -81944,659 +80800,695 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-LABEL: bitcast_v32f16_to_v64i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_writelane_b32 v18, s30, 0 +; SI-NEXT: v_writelane_b32 v18, s31, 1 +; SI-NEXT: v_writelane_b32 v18, s34, 2 +; SI-NEXT: v_writelane_b32 v18, s35, 3 +; SI-NEXT: v_writelane_b32 v18, s36, 4 +; SI-NEXT: v_writelane_b32 v18, s37, 5 +; SI-NEXT: v_writelane_b32 v18, s38, 6 +; SI-NEXT: v_writelane_b32 v18, s39, 7 +; SI-NEXT: v_writelane_b32 v18, s48, 8 +; SI-NEXT: v_writelane_b32 v18, s49, 9 +; SI-NEXT: v_writelane_b32 v18, s50, 10 +; SI-NEXT: v_writelane_b32 v18, s51, 11 +; SI-NEXT: v_writelane_b32 v18, s52, 12 +; SI-NEXT: v_writelane_b32 v18, s53, 13 +; SI-NEXT: v_writelane_b32 v18, s54, 14 +; SI-NEXT: v_writelane_b32 v18, s55, 15 +; SI-NEXT: v_writelane_b32 v18, s64, 16 +; SI-NEXT: v_writelane_b32 v18, s65, 17 +; SI-NEXT: v_writelane_b32 v18, s66, 18 +; SI-NEXT: v_writelane_b32 v18, s67, 19 +; SI-NEXT: v_writelane_b32 v18, s68, 20 +; SI-NEXT: v_writelane_b32 v18, s69, 21 +; SI-NEXT: v_writelane_b32 v18, s70, 22 +; SI-NEXT: v_writelane_b32 v18, s71, 23 +; SI-NEXT: v_writelane_b32 v18, s80, 24 +; SI-NEXT: v_writelane_b32 v18, s81, 25 +; SI-NEXT: v_writelane_b32 v18, s82, 26 +; SI-NEXT: v_writelane_b32 v18, s83, 27 +; SI-NEXT: v_writelane_b32 v18, s84, 28 +; SI-NEXT: v_writelane_b32 v18, s85, 29 +; SI-NEXT: v_writelane_b32 v18, s86, 30 +; SI-NEXT: v_writelane_b32 v18, s87, 31 +; SI-NEXT: v_writelane_b32 v18, s96, 32 +; SI-NEXT: v_writelane_b32 v18, s97, 33 +; SI-NEXT: v_writelane_b32 v18, s98, 34 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s10 -; SI-NEXT: s_lshr_b32 s10, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 -; SI-NEXT: s_lshr_b32 s10, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s10 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s10 -; SI-NEXT: s_lshr_b32 s10, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s10 -; SI-NEXT: s_lshr_b32 s10, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 -; SI-NEXT: s_lshr_b32 s10, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 -; SI-NEXT: s_lshr_b32 s10, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s17 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s24 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s27 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s29 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v40, s30, 0 -; SI-NEXT: v_writelane_b32 v40, s31, 1 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v31 -; SI-NEXT: v_writelane_b32 v40, s34, 2 -; SI-NEXT: v_writelane_b32 v40, s35, 3 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: s_lshr_b32 s96, s29, 16 +; SI-NEXT: s_lshr_b32 s97, s28, 16 +; SI-NEXT: s_lshr_b32 s86, s27, 16 +; SI-NEXT: s_lshr_b32 s87, s26, 16 +; SI-NEXT: s_lshr_b32 s84, s25, 16 +; SI-NEXT: s_lshr_b32 s85, s24, 16 +; SI-NEXT: s_lshr_b32 s82, s23, 16 +; SI-NEXT: s_lshr_b32 s83, s22, 16 +; SI-NEXT: s_lshr_b32 s80, s21, 16 +; SI-NEXT: s_lshr_b32 s81, s20, 16 +; SI-NEXT: s_lshr_b32 s70, s19, 16 +; SI-NEXT: s_lshr_b32 s71, s18, 16 +; SI-NEXT: s_lshr_b32 s68, s17, 16 +; SI-NEXT: s_lshr_b32 s69, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; SI-NEXT: v_writelane_b32 v40, s36, 4 +; SI-NEXT: v_writelane_b32 v18, s99, 35 +; SI-NEXT: v_readfirstlane_b32 s98, v2 +; SI-NEXT: v_readfirstlane_b32 s44, v1 +; SI-NEXT: v_readfirstlane_b32 s99, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_writelane_b32 v40, s37, 5 -; SI-NEXT: s_cbranch_scc0 .LBB105_4 +; SI-NEXT: v_readfirstlane_b32 s46, v5 +; SI-NEXT: ; implicit-def: $vgpr19 : SGPR spill to VGPR lane +; SI-NEXT: s_cbranch_scc0 .LBB105_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_readfirstlane_b32 s4, v12 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v11 -; SI-NEXT: s_or_b32 s18, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v9 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v10 -; SI-NEXT: s_or_b32 s19, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v15 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v14 -; SI-NEXT: s_or_b32 s16, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v8 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v13 -; SI-NEXT: s_or_b32 s17, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v18 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v17 -; SI-NEXT: s_or_b32 s14, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v7 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v16 -; SI-NEXT: s_or_b32 s15, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v21 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v20 -; SI-NEXT: s_or_b32 s12, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v6 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v19 -; SI-NEXT: s_or_b32 s13, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v24 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v23 -; SI-NEXT: s_or_b32 s10, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v5 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v22 -; SI-NEXT: s_or_b32 s11, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v27 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v26 -; SI-NEXT: s_or_b32 s8, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v4 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v25 -; SI-NEXT: s_or_b32 s9, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v34 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v32 -; SI-NEXT: s_or_b32 s6, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v30 -; SI-NEXT: s_or_b32 s7, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v39 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v38 -; SI-NEXT: s_lshr_b64 s[20:21], s[18:19], 24 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s5, v1 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_readfirstlane_b32 s21, v36 -; SI-NEXT: s_lshr_b64 s[22:23], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[26:27], s[18:19], 8 -; SI-NEXT: s_lshr_b64 s[24:25], s[16:17], 24 -; SI-NEXT: s_lshr_b64 s[28:29], s[16:17], 16 -; SI-NEXT: s_lshr_b64 s[42:43], s[16:17], 8 -; SI-NEXT: s_lshr_b64 s[40:41], s[14:15], 24 -; SI-NEXT: s_lshr_b64 s[44:45], s[14:15], 16 -; SI-NEXT: s_or_b32 s5, s21, s5 -; SI-NEXT: s_lshr_b64 s[56:57], s[14:15], 8 -; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 24 -; SI-NEXT: s_lshr_b64 s[58:59], s[12:13], 16 -; SI-NEXT: s_lshr_b64 s[62:63], s[12:13], 8 -; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 -; SI-NEXT: s_lshr_b64 s[72:73], s[10:11], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[10:11], 8 -; SI-NEXT: s_lshr_b64 s[74:75], s[8:9], 24 -; SI-NEXT: s_lshr_b64 s[78:79], s[8:9], 16 -; SI-NEXT: s_lshr_b64 s[90:91], s[8:9], 8 -; SI-NEXT: s_lshr_b64 s[88:89], s[6:7], 24 -; SI-NEXT: s_lshr_b64 s[92:93], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[94:95], s[6:7], 8 -; SI-NEXT: s_lshr_b64 s[30:31], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[34:35], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[36:37], s[4:5], 8 -; SI-NEXT: s_lshr_b32 s45, s19, 8 -; SI-NEXT: s_lshr_b32 s43, s17, 8 -; SI-NEXT: s_lshr_b32 s41, s15, 8 -; SI-NEXT: s_lshr_b32 s29, s13, 8 -; SI-NEXT: s_lshr_b32 s27, s11, 8 -; SI-NEXT: s_lshr_b32 s25, s9, 8 -; SI-NEXT: s_lshr_b32 s23, s7, 8 -; SI-NEXT: s_lshr_b32 s21, s5, 8 -; SI-NEXT: v_bfe_u32 v48, v9, 8, 8 -; SI-NEXT: v_bfe_u32 v37, v8, 8, 8 -; SI-NEXT: v_bfe_u32 v35, v7, 8, 8 -; SI-NEXT: v_bfe_u32 v33, v6, 8, 8 -; SI-NEXT: v_bfe_u32 v31, v5, 8, 8 -; SI-NEXT: v_bfe_u32 v29, v4, 8, 8 -; SI-NEXT: v_bfe_u32 v28, v2, 8, 8 -; SI-NEXT: v_bfe_u32 v3, v1, 8, 8 -; SI-NEXT: s_cbranch_execnz .LBB105_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s69, 16 +; SI-NEXT: s_or_b32 s42, s4, s5 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s68, 16 +; SI-NEXT: s_or_b32 s43, s4, s5 +; SI-NEXT: s_lshr_b64 s[4:5], s[42:43], 24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v19, s4, 4 +; SI-NEXT: v_writelane_b32 v19, s5, 5 +; SI-NEXT: s_lshr_b64 s[4:5], s[42:43], 16 +; SI-NEXT: v_writelane_b32 v19, s4, 2 +; SI-NEXT: v_writelane_b32 v19, s5, 3 +; SI-NEXT: s_lshr_b64 s[4:5], s[42:43], 8 +; SI-NEXT: v_writelane_b32 v19, s4, 0 +; SI-NEXT: v_writelane_b32 v19, s5, 1 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s71, 16 +; SI-NEXT: s_or_b32 s40, s4, s5 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s70, 16 +; SI-NEXT: s_or_b32 s41, s4, s5 +; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 24 +; SI-NEXT: v_writelane_b32 v19, s4, 8 +; SI-NEXT: v_writelane_b32 v19, s5, 9 +; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 16 +; SI-NEXT: v_writelane_b32 v19, s4, 6 +; SI-NEXT: v_writelane_b32 v19, s5, 7 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s81, 16 +; SI-NEXT: s_or_b32 s14, s4, s5 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s80, 16 +; SI-NEXT: s_or_b32 s15, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s83, 16 +; SI-NEXT: s_or_b32 s12, s4, s5 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s82, 16 +; SI-NEXT: s_or_b32 s13, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s85, 16 +; SI-NEXT: s_or_b32 s10, s4, s5 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s84, 16 +; SI-NEXT: s_or_b32 s11, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s87, 16 +; SI-NEXT: s_or_b32 s8, s4, s5 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s86, 16 +; SI-NEXT: s_or_b32 s9, s4, s5 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s97, 16 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s96, 16 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_and_b32 s4, s44, 0xffff +; SI-NEXT: s_lshl_b32 s5, s46, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s98, 0xffff +; SI-NEXT: s_lshl_b32 s45, s99, 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[40:41], 8 +; SI-NEXT: s_lshr_b64 s[62:63], s[14:15], 24 +; SI-NEXT: s_lshr_b64 s[72:73], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[14:15], 8 +; SI-NEXT: s_lshr_b64 s[78:79], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[90:91], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[88:89], s[10:11], 24 +; SI-NEXT: s_or_b32 s5, s5, s45 +; SI-NEXT: s_lshr_b32 s75, s43, 8 +; SI-NEXT: s_lshr_b32 s73, s41, 8 +; SI-NEXT: s_lshr_b32 s63, s15, 8 +; SI-NEXT: s_lshr_b32 s61, s13, 8 +; SI-NEXT: s_lshr_b32 s59, s11, 8 +; SI-NEXT: s_lshr_b32 s57, s9, 8 +; SI-NEXT: s_lshr_b32 s47, s7, 8 +; SI-NEXT: s_lshr_b32 s45, s5, 8 +; SI-NEXT: s_bfe_u32 s77, s68, 0x80008 +; SI-NEXT: s_bfe_u32 s79, s70, 0x80008 +; SI-NEXT: s_bfe_u32 s89, s80, 0x80008 +; SI-NEXT: s_bfe_u32 s91, s82, 0x80008 +; SI-NEXT: s_bfe_u32 s93, s84, 0x80008 +; SI-NEXT: s_bfe_u32 s56, s86, 0x80008 +; SI-NEXT: s_bfe_u32 s58, s96, 0x80008 +; SI-NEXT: s_bfe_u32 s60, s99, 0x80008 +; SI-NEXT: s_lshr_b64 s[64:65], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[66:67], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[50:51], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[52:53], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[54:55], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[36:37], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[38:39], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[48:49], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[94:95], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[30:31], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[4:5], 8 +; SI-NEXT: s_cbranch_execnz .LBB105_4 ; SI-NEXT: .LBB105_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s99 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s16 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_readfirstlane_b32 s4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 -; SI-NEXT: v_readfirstlane_b32 s5, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_readfirstlane_b32 s5, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s98 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s97 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_readfirstlane_b32 s6, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 -; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v4 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_readfirstlane_b32 s5, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_readfirstlane_b32 s7, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_readfirstlane_b32 s6, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s28 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_readfirstlane_b32 s6, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_readfirstlane_b32 s8, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 +; SI-NEXT: v_readfirstlane_b32 s6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s96 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v2 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s29 +; SI-NEXT: v_readfirstlane_b32 s7, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s87 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s26 ; SI-NEXT: v_readfirstlane_b32 s7, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_readfirstlane_b32 s9, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 ; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: v_readfirstlane_b32 s8, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_readfirstlane_b32 s10, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_readfirstlane_b32 s8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s86 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v3 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s27 +; SI-NEXT: v_readfirstlane_b32 s9, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s85 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: v_readfirstlane_b32 s9, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_readfirstlane_b32 s11, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: v_readfirstlane_b32 s9, v3 ; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_readfirstlane_b32 s10, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s24 ; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: v_readfirstlane_b32 s10, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_readfirstlane_b32 s12, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 +; SI-NEXT: v_readfirstlane_b32 s10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s84 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v4 ; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s25 +; SI-NEXT: v_readfirstlane_b32 s11, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s83 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_readfirstlane_b32 s11, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_readfirstlane_b32 s13, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v19 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_readfirstlane_b32 s12, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s22 +; SI-NEXT: v_readfirstlane_b32 s11, v4 ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: s_or_b32 s11, s12, s11 -; SI-NEXT: v_readfirstlane_b32 s12, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_readfirstlane_b32 s14, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_readfirstlane_b32 s12, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s82 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v5 ; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s23 +; SI-NEXT: v_readfirstlane_b32 s13, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s81 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: v_readfirstlane_b32 s13, v6 -; SI-NEXT: v_readfirstlane_b32 s15, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: v_readfirstlane_b32 s13, v5 ; SI-NEXT: s_lshl_b32 s13, s13, 16 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_readfirstlane_b32 s14, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 ; SI-NEXT: s_or_b32 s13, s14, s13 -; SI-NEXT: v_readfirstlane_b32 s14, v18 -; SI-NEXT: v_readfirstlane_b32 s16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_readfirstlane_b32 s14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s80 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v6 ; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 +; SI-NEXT: v_readfirstlane_b32 s15, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s71 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: s_or_b32 s14, s15, s14 -; SI-NEXT: v_readfirstlane_b32 s15, v7 -; SI-NEXT: v_readfirstlane_b32 s17, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: v_readfirstlane_b32 s20, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s18 +; SI-NEXT: v_readfirstlane_b32 s18, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s70 +; SI-NEXT: v_readfirstlane_b32 s15, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: s_lshl_b32 s15, s15, 16 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: s_or_b32 s15, s16, s15 -; SI-NEXT: v_readfirstlane_b32 s16, v15 -; SI-NEXT: v_readfirstlane_b32 s18, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readfirstlane_b32 s19, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v9 -; SI-NEXT: v_readfirstlane_b32 s17, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v10 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: v_readfirstlane_b32 s18, v12 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: v_readfirstlane_b32 s19, v9 -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: v_readfirstlane_b32 s20, v3 -; SI-NEXT: s_or_b32 s19, s20, s19 -; SI-NEXT: s_lshr_b64 s[20:21], s[18:19], 24 -; SI-NEXT: s_lshr_b64 s[22:23], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[26:27], s[18:19], 8 -; SI-NEXT: s_lshr_b64 s[24:25], s[16:17], 24 -; SI-NEXT: s_lshr_b64 s[28:29], s[16:17], 16 -; SI-NEXT: s_lshr_b64 s[42:43], s[16:17], 8 -; SI-NEXT: s_lshr_b64 s[40:41], s[14:15], 24 -; SI-NEXT: s_lshr_b64 s[44:45], s[14:15], 16 -; SI-NEXT: s_lshr_b64 s[56:57], s[14:15], 8 -; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 24 -; SI-NEXT: s_lshr_b64 s[58:59], s[12:13], 16 -; SI-NEXT: s_lshr_b64 s[62:63], s[12:13], 8 -; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 -; SI-NEXT: s_lshr_b64 s[72:73], s[10:11], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[10:11], 8 -; SI-NEXT: s_lshr_b64 s[74:75], s[8:9], 24 -; SI-NEXT: s_lshr_b64 s[78:79], s[8:9], 16 -; SI-NEXT: s_lshr_b64 s[90:91], s[8:9], 8 -; SI-NEXT: s_lshr_b64 s[88:89], s[6:7], 24 -; SI-NEXT: s_lshr_b64 s[92:93], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[94:95], s[6:7], 8 -; SI-NEXT: s_lshr_b64 s[30:31], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[34:35], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[36:37], s[4:5], 8 -; SI-NEXT: s_lshr_b32 s45, s19, 8 -; SI-NEXT: s_lshr_b32 s43, s17, 8 -; SI-NEXT: s_lshr_b32 s41, s15, 8 -; SI-NEXT: s_lshr_b32 s29, s13, 8 -; SI-NEXT: s_lshr_b32 s27, s11, 8 -; SI-NEXT: s_lshr_b32 s25, s9, 8 -; SI-NEXT: s_lshr_b32 s23, s7, 8 -; SI-NEXT: s_lshr_b32 s21, s5, 8 -; SI-NEXT: v_bfe_u32 v48, v9, 8, 8 -; SI-NEXT: v_bfe_u32 v37, v8, 8, 8 -; SI-NEXT: v_bfe_u32 v35, v7, 8, 8 -; SI-NEXT: v_bfe_u32 v33, v6, 8, 8 -; SI-NEXT: v_bfe_u32 v31, v5, 8, 8 -; SI-NEXT: v_bfe_u32 v29, v4, 8, 8 -; SI-NEXT: v_bfe_u32 v28, v2, 8, 8 -; SI-NEXT: v_bfe_u32 v3, v1, 8, 8 -; SI-NEXT: .LBB105_3: ; %end -; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: s_lshl_b32 s26, s26, 8 -; SI-NEXT: s_and_b32 s22, s22, 0xff -; SI-NEXT: s_or_b32 s18, s18, s26 -; SI-NEXT: s_lshl_b32 s22, s22, 16 -; SI-NEXT: s_lshl_b32 s20, s20, 24 -; SI-NEXT: s_and_b32 s18, s18, 0xffff -; SI-NEXT: s_or_b32 s20, s20, s22 -; SI-NEXT: s_or_b32 s18, s18, s20 -; SI-NEXT: v_mov_b32_e32 v10, s18 -; SI-NEXT: s_and_b32 s18, s19, 0xff -; SI-NEXT: s_lshl_b32 s19, s45, 8 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen -; SI-NEXT: s_or_b32 s18, s18, s19 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v48 -; SI-NEXT: s_and_b32 s18, s18, 0xffff -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_or_b32_e32 v9, s18, v9 -; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_lshl_b32 s18, s42, 8 -; SI-NEXT: s_or_b32 s16, s16, s18 -; SI-NEXT: s_and_b32 s18, s28, 0xff +; SI-NEXT: s_or_b32 s15, s20, s15 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_readfirstlane_b32 s20, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s69 ; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_lshl_b32 s19, s24, 24 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_or_b32 s40, s20, s18 +; SI-NEXT: v_readfirstlane_b32 s18, v9 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: v_readfirstlane_b32 s16, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v10 +; SI-NEXT: s_or_b32 s41, s16, s18 +; SI-NEXT: v_readfirstlane_b32 s16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s68 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s17 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: v_readfirstlane_b32 s17, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_or_b32 s42, s17, s16 +; SI-NEXT: v_readfirstlane_b32 s16, v12 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: v_readfirstlane_b32 s17, v7 +; SI-NEXT: s_or_b32 s43, s17, s16 +; SI-NEXT: s_lshr_b64 s[74:75], s[40:41], 8 +; SI-NEXT: s_lshr_b64 s[62:63], s[14:15], 24 +; SI-NEXT: s_lshr_b64 s[72:73], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[18:19], s[42:43], 24 +; SI-NEXT: s_lshr_b64 s[20:21], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[42:43], 8 +; SI-NEXT: s_lshr_b64 s[22:23], s[40:41], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[14:15], 8 +; SI-NEXT: s_lshr_b64 s[78:79], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[90:91], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[88:89], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[64:65], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[66:67], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[50:51], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[52:53], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[54:55], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[36:37], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[38:39], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[48:49], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[94:95], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[30:31], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[4:5], 8 +; SI-NEXT: s_lshr_b32 s75, s43, 8 +; SI-NEXT: s_lshr_b32 s73, s41, 8 +; SI-NEXT: s_lshr_b32 s63, s15, 8 +; SI-NEXT: s_lshr_b32 s61, s13, 8 +; SI-NEXT: s_lshr_b32 s59, s11, 8 +; SI-NEXT: s_lshr_b32 s57, s9, 8 +; SI-NEXT: s_lshr_b32 s47, s7, 8 +; SI-NEXT: s_lshr_b32 s45, s5, 8 +; SI-NEXT: v_bfe_u32 v16, v12, 8, 8 +; SI-NEXT: v_bfe_u32 v15, v9, 8, 8 +; SI-NEXT: v_bfe_u32 v14, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v13, v5, 8, 8 +; SI-NEXT: v_bfe_u32 v11, v4, 8, 8 +; SI-NEXT: v_bfe_u32 v10, v3, 8, 8 +; SI-NEXT: v_bfe_u32 v8, v2, 8, 8 +; SI-NEXT: v_bfe_u32 v7, v1, 8, 8 +; SI-NEXT: s_branch .LBB105_5 +; SI-NEXT: .LBB105_3: +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v19, s4, 0 +; SI-NEXT: v_writelane_b32 v19, s5, 1 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: v_writelane_b32 v19, s4, 2 +; SI-NEXT: v_writelane_b32 v19, s5, 3 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v19, s4, 4 +; SI-NEXT: v_writelane_b32 v19, s5, 5 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v19, s4, 6 +; SI-NEXT: v_writelane_b32 v19, s5, 7 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v19, s4, 8 +; SI-NEXT: v_writelane_b32 v19, s5, 9 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: s_branch .LBB105_2 +; SI-NEXT: .LBB105_4: +; SI-NEXT: v_readlane_b32 s18, v19, 4 +; SI-NEXT: v_readlane_b32 s20, v19, 2 +; SI-NEXT: v_readlane_b32 s22, v19, 8 +; SI-NEXT: v_readlane_b32 s24, v19, 0 +; SI-NEXT: v_readlane_b32 s26, v19, 6 +; SI-NEXT: v_mov_b32_e32 v1, s99 +; SI-NEXT: v_mov_b32_e32 v2, s96 +; SI-NEXT: v_mov_b32_e32 v3, s86 +; SI-NEXT: v_mov_b32_e32 v4, s84 +; SI-NEXT: v_mov_b32_e32 v5, s82 +; SI-NEXT: v_mov_b32_e32 v6, s80 +; SI-NEXT: v_mov_b32_e32 v9, s70 +; SI-NEXT: v_mov_b32_e32 v12, s68 +; SI-NEXT: v_mov_b32_e32 v7, s60 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v10, s56 +; SI-NEXT: v_mov_b32_e32 v11, s93 +; SI-NEXT: v_mov_b32_e32 v13, s91 +; SI-NEXT: v_mov_b32_e32 v14, s89 +; SI-NEXT: v_mov_b32_e32 v15, s79 +; SI-NEXT: v_mov_b32_e32 v16, s77 +; SI-NEXT: v_readlane_b32 s19, v19, 5 +; SI-NEXT: v_readlane_b32 s21, v19, 3 +; SI-NEXT: v_readlane_b32 s23, v19, 9 +; SI-NEXT: v_readlane_b32 s25, v19, 1 +; SI-NEXT: v_readlane_b32 s27, v19, 7 +; SI-NEXT: .LBB105_5: ; %end +; SI-NEXT: s_and_b32 s16, s42, 0xff +; SI-NEXT: s_lshl_b32 s17, s24, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s20, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: v_add_i32_e32 v10, vcc, 4, v0 -; SI-NEXT: s_or_b32 s16, s16, s18 -; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v17, s16 +; SI-NEXT: s_and_b32 s16, s43, 0xff +; SI-NEXT: s_lshl_b32 s17, s75, 8 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: v_or_b32_e32 v12, v16, v12 +; SI-NEXT: v_or_b32_e32 v12, s16, v12 +; SI-NEXT: s_and_b32 s16, s40, 0xff +; SI-NEXT: s_lshl_b32 s17, s74, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s26, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s22, 24 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 4, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v12, v16, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v9, vcc, 8, v0 -; SI-NEXT: v_mov_b32_e32 v10, s16 -; SI-NEXT: s_and_b32 s16, s17, 0xff -; SI-NEXT: s_lshl_b32 s17, s43, 8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v12, vcc, 8, v0 +; SI-NEXT: v_mov_b32_e32 v16, s16 +; SI-NEXT: s_and_b32 s16, s41, 0xff +; SI-NEXT: s_lshl_b32 s17, s73, 8 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: buffer_store_dword v16, v12, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v37 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v15 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_or_b32_e32 v8, s16, v8 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v9, s16, v9 ; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: s_lshl_b32 s16, s56, 8 +; SI-NEXT: s_lshl_b32 s16, s76, 8 ; SI-NEXT: s_or_b32 s14, s14, s16 -; SI-NEXT: s_and_b32 s16, s44, 0xff +; SI-NEXT: s_and_b32 s16, s72, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_lshl_b32 s17, s40, 24 +; SI-NEXT: s_lshl_b32 s17, s62, 24 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_add_i32_e32 v9, vcc, 12, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 12, v0 ; SI-NEXT: s_or_b32 s14, s14, s16 -; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v9, v12, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v8, vcc, 16, v0 -; SI-NEXT: v_mov_b32_e32 v9, s14 +; SI-NEXT: v_add_i32_e32 v9, vcc, 16, v0 +; SI-NEXT: v_mov_b32_e32 v12, s14 ; SI-NEXT: s_and_b32 s14, s15, 0xff -; SI-NEXT: s_lshl_b32 s15, s41, 8 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s15, s63, 8 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: buffer_store_dword v12, v9, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v35 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v14 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_or_b32_e32 v7, s14, v7 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: v_or_b32_e32 v6, s14, v6 ; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: s_lshl_b32 s14, s62, 8 +; SI-NEXT: s_lshl_b32 s14, s92, 8 ; SI-NEXT: s_or_b32 s12, s12, s14 -; SI-NEXT: s_and_b32 s14, s58, 0xff +; SI-NEXT: s_and_b32 s14, s90, 0xff ; SI-NEXT: s_lshl_b32 s14, s14, 16 -; SI-NEXT: s_lshl_b32 s15, s46, 24 +; SI-NEXT: s_lshl_b32 s15, s78, 24 ; SI-NEXT: s_and_b32 s12, s12, 0xffff ; SI-NEXT: s_or_b32 s14, s15, s14 -; SI-NEXT: v_add_i32_e32 v8, vcc, 20, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 20, v0 ; SI-NEXT: s_or_b32 s12, s12, s14 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 24, v0 -; SI-NEXT: v_mov_b32_e32 v8, s12 +; SI-NEXT: v_add_i32_e32 v6, vcc, 24, v0 +; SI-NEXT: v_mov_b32_e32 v9, s12 ; SI-NEXT: s_and_b32 s12, s13, 0xff -; SI-NEXT: s_lshl_b32 s13, s29, 8 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s13, s61, 8 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: buffer_store_dword v9, v6, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s12, s12, s13 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v33 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v13 ; SI-NEXT: s_and_b32 s12, s12, 0xffff -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_or_b32_e32 v6, s12, v6 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_or_b32_e32 v5, s12, v5 ; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: s_lshl_b32 s12, s76, 8 +; SI-NEXT: s_lshl_b32 s12, s66, 8 ; SI-NEXT: s_or_b32 s10, s10, s12 -; SI-NEXT: s_and_b32 s12, s72, 0xff +; SI-NEXT: s_and_b32 s12, s64, 0xff ; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: s_lshl_b32 s13, s60, 24 +; SI-NEXT: s_lshl_b32 s13, s88, 24 ; SI-NEXT: s_and_b32 s10, s10, 0xffff ; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: v_add_i32_e32 v7, vcc, 28, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 28, v0 ; SI-NEXT: s_or_b32 s10, s10, s12 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 32, v0 -; SI-NEXT: v_mov_b32_e32 v7, s10 +; SI-NEXT: v_add_i32_e32 v5, vcc, 32, v0 +; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_and_b32 s10, s11, 0xff -; SI-NEXT: s_lshl_b32 s11, s27, 8 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s11, s59, 8 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v31 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v11 ; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_or_b32_e32 v5, s10, v5 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v4, s10, v4 ; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: s_lshl_b32 s10, s90, 8 +; SI-NEXT: s_lshl_b32 s10, s54, 8 ; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: s_and_b32 s10, s78, 0xff +; SI-NEXT: s_and_b32 s10, s52, 0xff ; SI-NEXT: s_lshl_b32 s10, s10, 16 -; SI-NEXT: s_lshl_b32 s11, s74, 24 +; SI-NEXT: s_lshl_b32 s11, s50, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xffff ; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_add_i32_e32 v6, vcc, 36, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 36, v0 ; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 40, v0 -; SI-NEXT: v_mov_b32_e32 v6, s8 +; SI-NEXT: v_add_i32_e32 v4, vcc, 40, v0 +; SI-NEXT: v_mov_b32_e32 v5, s8 ; SI-NEXT: s_and_b32 s8, s9, 0xff -; SI-NEXT: s_lshl_b32 s9, s25, 8 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s9, s57, 8 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v29 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v10 ; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_or_b32_e32 v4, s8, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v3, s8, v3 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_lshl_b32 s8, s94, 8 +; SI-NEXT: s_lshl_b32 s8, s48, 8 ; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: s_and_b32 s8, s92, 0xff +; SI-NEXT: s_and_b32 s8, s38, 0xff ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_lshl_b32 s9, s88, 24 +; SI-NEXT: s_lshl_b32 s9, s36, 24 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: v_add_i32_e32 v5, vcc, 44, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 44, v0 ; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 48, v0 -; SI-NEXT: v_mov_b32_e32 v5, s6 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_mov_b32_e32 v4, s6 ; SI-NEXT: s_and_b32 s6, s7, 0xff -; SI-NEXT: s_lshl_b32 s7, s23, 8 +; SI-NEXT: s_lshl_b32 s7, s47, 8 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v28 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v8 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v2, s6, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s6, s36, 8 +; SI-NEXT: s_lshl_b32 s6, s34, 8 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: s_and_b32 s6, s34, 0xff +; SI-NEXT: s_and_b32 s6, s30, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s7, s30, 24 +; SI-NEXT: s_lshl_b32 s7, s94, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: v_add_i32_e32 v4, vcc, 52, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_mov_b32_e32 v3, s4 ; SI-NEXT: s_and_b32 s4, s5, 0xff -; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_lshl_b32 s5, s45, 8 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v7 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v1, s4, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 s37, v40, 5 -; SI-NEXT: v_readlane_b32 s36, v40, 4 -; SI-NEXT: v_readlane_b32 s35, v40, 3 -; SI-NEXT: v_readlane_b32 s34, v40, 2 -; SI-NEXT: v_readlane_b32 s31, v40, 1 -; SI-NEXT: v_readlane_b32 s30, v40, 0 -; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s99, v18, 35 +; SI-NEXT: v_readlane_b32 s98, v18, 34 +; SI-NEXT: v_readlane_b32 s97, v18, 33 +; SI-NEXT: v_readlane_b32 s96, v18, 32 +; SI-NEXT: v_readlane_b32 s87, v18, 31 +; SI-NEXT: v_readlane_b32 s86, v18, 30 +; SI-NEXT: v_readlane_b32 s85, v18, 29 +; SI-NEXT: v_readlane_b32 s84, v18, 28 +; SI-NEXT: v_readlane_b32 s83, v18, 27 +; SI-NEXT: v_readlane_b32 s82, v18, 26 +; SI-NEXT: v_readlane_b32 s81, v18, 25 +; SI-NEXT: v_readlane_b32 s80, v18, 24 +; SI-NEXT: v_readlane_b32 s71, v18, 23 +; SI-NEXT: v_readlane_b32 s70, v18, 22 +; SI-NEXT: v_readlane_b32 s69, v18, 21 +; SI-NEXT: v_readlane_b32 s68, v18, 20 +; SI-NEXT: v_readlane_b32 s67, v18, 19 +; SI-NEXT: v_readlane_b32 s66, v18, 18 +; SI-NEXT: v_readlane_b32 s65, v18, 17 +; SI-NEXT: v_readlane_b32 s64, v18, 16 +; SI-NEXT: v_readlane_b32 s55, v18, 15 +; SI-NEXT: v_readlane_b32 s54, v18, 14 +; SI-NEXT: v_readlane_b32 s53, v18, 13 +; SI-NEXT: v_readlane_b32 s52, v18, 12 +; SI-NEXT: v_readlane_b32 s51, v18, 11 +; SI-NEXT: v_readlane_b32 s50, v18, 10 +; SI-NEXT: v_readlane_b32 s49, v18, 9 +; SI-NEXT: v_readlane_b32 s48, v18, 8 +; SI-NEXT: v_readlane_b32 s39, v18, 7 +; SI-NEXT: v_readlane_b32 s38, v18, 6 +; SI-NEXT: v_readlane_b32 s37, v18, 5 +; SI-NEXT: v_readlane_b32 s36, v18, 4 +; SI-NEXT: v_readlane_b32 s35, v18, 3 +; SI-NEXT: v_readlane_b32 s34, v18, 2 +; SI-NEXT: v_readlane_b32 s31, v18, 1 +; SI-NEXT: v_readlane_b32 s30, v18, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB105_4: -; SI-NEXT: ; implicit-def: $sgpr18 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr22 -; SI-NEXT: ; implicit-def: $sgpr20 -; SI-NEXT: ; implicit-def: $sgpr45 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $sgpr16 -; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $sgpr24 -; SI-NEXT: ; implicit-def: $sgpr43 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr41 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr29 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr27 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr90 -; SI-NEXT: ; implicit-def: $sgpr78 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr25 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr94 -; SI-NEXT: ; implicit-def: $sgpr92 -; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: ; implicit-def: $sgpr23 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr21 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $sgpr36 -; SI-NEXT: ; implicit-def: $sgpr34 -; SI-NEXT: ; implicit-def: $sgpr30 -; SI-NEXT: s_branch .LBB105_2 ; ; VI-LABEL: bitcast_v32f16_to_v64i8_scalar: ; VI: ; %bb.0: @@ -83996,349 +82888,376 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:128 -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v17 -; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v23 -; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v25 -; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; kill: killed $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; kill: killed $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; kill: killed $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; kill: killed $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; kill: killed $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; kill: killed $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v13 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v13 -; SI-NEXT: ; kill: killed $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v15 -; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v19 -; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 -; SI-NEXT: ; kill: killed $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; kill: killed $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v55, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v29 +; SI-NEXT: v_lshlrev_b32_e32 v44, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v25 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 -; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v4 -; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v6 -; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v8 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:116 -; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v2 -; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v10 -; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v12 -; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v14 -; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v16 -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v18 -; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v22 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v4 +; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v16 +; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v2 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v20 +; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v10 +; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v26 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v28 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:12 +; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v14 +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v22 +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v31, 8, v32 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v28 -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v30 -; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v33 -; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v35 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:52 -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v31 -; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v32 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:28 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v35 +; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB106_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v3, v3, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v16, v16, v13 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v1, v6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v16 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v12, v12, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v39, v46, v9 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v29, v5, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v23, v0, v29 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v39 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v15 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; SI-NEXT: v_or_b32_e32 v16, v16, v18 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v14, v5, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v36, v5, v0 +; SI-NEXT: v_alignbit_b32 v0, v23, v14, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v0, v36 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v17, v5, v0 +; SI-NEXT: v_alignbit_b32 v0, v3, v17, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_or_b32_e32 v11, v11, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v0, v0, v55 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v37, v7, v5 +; SI-NEXT: v_or_b32_e32 v5, v0, v37 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v19, v54, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v38, v42, v7 +; SI-NEXT: v_alignbit_b32 v53, v5, v19, 16 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_or_b32_e32 v11, v11, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v7, v0, v38 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_or_b32_e32 v11, v11, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v25, v44, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v45, v7, v25, 16 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_or_b32_e32 v11, v11, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v9, v0, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v20, v57, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v12, v59, v11 +; SI-NEXT: v_or_b32_e32 v11, v0, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v26, v63, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v40 +; SI-NEXT: v_or_b32_e32 v0, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v13, v0, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v2, v2, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v0, v0, v34 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v4, v4, v8 +; SI-NEXT: v_or_b32_e32 v15, v0, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v8, v27, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v32, v9, v20, 16 +; SI-NEXT: v_alignbit_b32 v22, v11, v26, 16 +; SI-NEXT: v_alignbit_b32 v21, v13, v2, 16 +; SI-NEXT: v_alignbit_b32 v33, v15, v8, 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v4 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_or_b32_e32 v11, v11, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_or_b32_e32 v0, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_or_b32_e32 v11, v11, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_or_b32_e32 v14, v14, v18 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_or_b32_e32 v11, v11, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v61, v17, v19 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v19, v1, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v51 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_or_b32_e32 v11, v11, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_or_b32_e32 v17, v17, v47 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v43, v17, v25 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_or_b32_e32 v11, v11, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v25 -; SI-NEXT: v_or_b32_e32 v11, v11, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v29 -; SI-NEXT: v_or_b32_e32 v11, v11, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v50 -; SI-NEXT: v_or_b32_e32 v11, v11, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v51 -; SI-NEXT: v_or_b32_e32 v11, v11, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v54 -; SI-NEXT: v_or_b32_e32 v11, v11, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v47 -; SI-NEXT: v_or_b32_e32 v11, v11, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 -; SI-NEXT: v_or_b32_e32 v3, v3, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v5 -; SI-NEXT: v_or_b32_e32 v3, v3, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v17 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v1, v1, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_or_b32_e32 v17, v17, v31 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v48, v17, v20 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_or_b32_e32 v10, v17, v10 +; SI-NEXT: v_or_b32_e32 v17, v1, v8 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -84352,325 +83271,352 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_or_b32_e32 v25, v10, v26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: .LBB106_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB106_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v24 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_or_b32_e32 v3, v27, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v3, v34, v3 ; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_or_b32_e32 v12, v9, v12 -; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v11 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v33 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_or_b32_e32 v7, v7, v11 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v2, v2, v6 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v8 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v47 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_or_b32_e32 v3, v20, v3 -; SI-NEXT: v_or_b32_e32 v8, v26, v8 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v25 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v6, v14, v6 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v29 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v12 -; SI-NEXT: v_or_b32_e32 v1, v1, v11 -; SI-NEXT: v_or_b32_e32 v5, v10, v5 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v50 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v6 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; SI-NEXT: v_or_b32_e32 v0, v37, v0 -; SI-NEXT: v_or_b32_e32 v4, v34, v4 -; SI-NEXT: v_or_b32_e32 v10, v60, v10 -; SI-NEXT: v_or_b32_e32 v11, v57, v11 -; SI-NEXT: v_or_b32_e32 v12, v56, v12 -; SI-NEXT: v_or_b32_e32 v14, v44, v14 -; SI-NEXT: v_or_b32_e32 v15, v41, v15 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 -; SI-NEXT: v_or_b32_e32 v19, v23, v19 -; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 -; SI-NEXT: v_or_b32_e32 v24, v38, v24 -; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 -; SI-NEXT: v_or_b32_e32 v16, v40, v16 -; SI-NEXT: v_or_b32_e32 v17, v52, v17 -; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v55, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v3 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v9 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 -; SI-NEXT: v_or_b32_e32 v18, v27, v18 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 -; SI-NEXT: v_or_b32_e32 v23, v39, v23 -; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 -; SI-NEXT: v_or_b32_e32 v19, v49, v19 -; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v53, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v8 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v52 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v16, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_alignbit_b32 v21, v13, v19, 16 +; SI-NEXT: v_alignbit_b32 v33, v15, v17, 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v15 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 -; SI-NEXT: v_or_b32_e32 v19, v48, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v10, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v25, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_alignbit_b32 v22, v11, v25, 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v48, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v46, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v32, v9, v48, 16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v9 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v6 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v43, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 -; SI-NEXT: v_or_b32_e32 v29, v30, v29 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v42, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v45, v7, v43, 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v30, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v54, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v61, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 -; SI-NEXT: v_or_b32_e32 v31, v32, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 -; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 -; SI-NEXT: .LBB106_4: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v53 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v12 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v53, v5, v61, 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v4, v19 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v23, vcc, s7, v1 +; SI-NEXT: v_alignbit_b32 v1, v23, v0, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v3, v14, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: .LBB106_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v53 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v45 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v32 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v22 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v33 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v45 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v55 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v61 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v46 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v13, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v62 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v37 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v43 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload @@ -84687,26 +83633,26 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v10, v22 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v30 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v31 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v13, v18, v13 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v38 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v29 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v21 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v49 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v50 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -86421,546 +85367,783 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; SI-LABEL: bitcast_v64i8_to_v32f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 -; SI-NEXT: v_readfirstlane_b32 s46, v20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v32, s30, 0 -; SI-NEXT: v_writelane_b32 v32, s31, 1 -; SI-NEXT: v_writelane_b32 v32, s34, 2 -; SI-NEXT: v_writelane_b32 v32, s35, 3 -; SI-NEXT: v_writelane_b32 v32, s36, 4 -; SI-NEXT: v_writelane_b32 v32, s37, 5 -; SI-NEXT: v_writelane_b32 v32, s38, 6 -; SI-NEXT: v_writelane_b32 v32, s39, 7 -; SI-NEXT: v_readfirstlane_b32 s74, v30 -; SI-NEXT: v_readfirstlane_b32 s61, v29 -; SI-NEXT: v_readfirstlane_b32 s63, v28 -; SI-NEXT: v_readfirstlane_b32 s59, v27 -; SI-NEXT: v_readfirstlane_b32 s60, v26 -; SI-NEXT: v_readfirstlane_b32 s57, v25 -; SI-NEXT: v_readfirstlane_b32 s58, v24 -; SI-NEXT: v_readfirstlane_b32 s47, v23 -; SI-NEXT: v_readfirstlane_b32 s56, v22 -; SI-NEXT: v_readfirstlane_b32 s44, v21 -; SI-NEXT: v_readfirstlane_b32 s34, v19 -; SI-NEXT: v_readfirstlane_b32 s37, v18 -; SI-NEXT: v_readfirstlane_b32 s94, v17 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_writelane_b32 v40, s30, 0 +; SI-NEXT: v_writelane_b32 v40, s31, 1 +; SI-NEXT: v_writelane_b32 v40, s34, 2 +; SI-NEXT: v_writelane_b32 v40, s35, 3 +; SI-NEXT: v_writelane_b32 v40, s36, 4 +; SI-NEXT: v_writelane_b32 v40, s37, 5 +; SI-NEXT: v_writelane_b32 v40, s38, 6 +; SI-NEXT: v_writelane_b32 v40, s39, 7 +; SI-NEXT: v_writelane_b32 v40, s48, 8 +; SI-NEXT: v_writelane_b32 v40, s49, 9 +; SI-NEXT: v_writelane_b32 v40, s50, 10 +; SI-NEXT: v_writelane_b32 v40, s51, 11 +; SI-NEXT: v_writelane_b32 v40, s52, 12 +; SI-NEXT: v_writelane_b32 v40, s53, 13 +; SI-NEXT: v_writelane_b32 v40, s54, 14 +; SI-NEXT: v_writelane_b32 v40, s55, 15 +; SI-NEXT: s_mov_b32 s92, s16 +; SI-NEXT: v_writelane_b32 v40, s64, 16 +; SI-NEXT: v_writelane_b32 v40, s65, 17 +; SI-NEXT: v_writelane_b32 v40, s66, 18 +; SI-NEXT: v_writelane_b32 v40, s67, 19 +; SI-NEXT: v_writelane_b32 v40, s68, 20 +; SI-NEXT: v_writelane_b32 v40, s69, 21 +; SI-NEXT: v_writelane_b32 v40, s70, 22 +; SI-NEXT: v_writelane_b32 v40, s71, 23 +; SI-NEXT: v_writelane_b32 v40, s80, 24 +; SI-NEXT: v_writelane_b32 v40, s81, 25 +; SI-NEXT: v_writelane_b32 v40, s82, 26 +; SI-NEXT: v_writelane_b32 v40, s83, 27 +; SI-NEXT: v_writelane_b32 v40, s84, 28 +; SI-NEXT: v_writelane_b32 v40, s85, 29 +; SI-NEXT: v_writelane_b32 v40, s86, 30 +; SI-NEXT: v_writelane_b32 v40, s87, 31 +; SI-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane +; SI-NEXT: v_writelane_b32 v40, s96, 32 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v41, s23, 0 +; SI-NEXT: v_writelane_b32 v40, s97, 33 +; SI-NEXT: v_writelane_b32 v41, s21, 1 +; SI-NEXT: v_readfirstlane_b32 s47, v29 +; SI-NEXT: v_writelane_b32 v40, s98, 34 +; SI-NEXT: v_writelane_b32 v41, s47, 2 +; SI-NEXT: v_writelane_b32 v40, s99, 35 +; SI-NEXT: v_readfirstlane_b32 s82, v30 +; SI-NEXT: v_readfirstlane_b32 s83, v28 +; SI-NEXT: v_readfirstlane_b32 s44, v27 +; SI-NEXT: v_readfirstlane_b32 s96, v26 +; SI-NEXT: v_readfirstlane_b32 s70, v25 +; SI-NEXT: v_readfirstlane_b32 s68, v24 +; SI-NEXT: v_readfirstlane_b32 s84, v23 +; SI-NEXT: v_readfirstlane_b32 s65, v22 +; SI-NEXT: v_readfirstlane_b32 s86, v21 +; SI-NEXT: v_readfirstlane_b32 s66, v20 +; SI-NEXT: v_readfirstlane_b32 s87, v19 +; SI-NEXT: v_readfirstlane_b32 s80, v18 +; SI-NEXT: v_readfirstlane_b32 s36, v17 ; SI-NEXT: v_readfirstlane_b32 s31, v16 -; SI-NEXT: v_readfirstlane_b32 s90, v15 -; SI-NEXT: v_readfirstlane_b32 s93, v14 -; SI-NEXT: v_readfirstlane_b32 s79, v13 -; SI-NEXT: v_readfirstlane_b32 s39, v12 -; SI-NEXT: v_readfirstlane_b32 s36, v11 -; SI-NEXT: v_readfirstlane_b32 s38, v10 -; SI-NEXT: v_readfirstlane_b32 s30, v9 +; SI-NEXT: v_readfirstlane_b32 s64, v15 +; SI-NEXT: v_readfirstlane_b32 s38, v14 +; SI-NEXT: v_readfirstlane_b32 s67, v13 +; SI-NEXT: v_readfirstlane_b32 s34, v12 +; SI-NEXT: v_readfirstlane_b32 s71, v11 +; SI-NEXT: v_readfirstlane_b32 s81, v10 +; SI-NEXT: v_readfirstlane_b32 s37, v9 ; SI-NEXT: v_readfirstlane_b32 s35, v8 -; SI-NEXT: v_readfirstlane_b32 s92, v7 -; SI-NEXT: v_readfirstlane_b32 s95, v6 -; SI-NEXT: v_readfirstlane_b32 s89, v5 -; SI-NEXT: v_readfirstlane_b32 s91, v4 -; SI-NEXT: v_readfirstlane_b32 s78, v3 -; SI-NEXT: v_readfirstlane_b32 s88, v2 -; SI-NEXT: v_readfirstlane_b32 s76, v1 -; SI-NEXT: v_readfirstlane_b32 s77, v0 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_readfirstlane_b32 s6, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s9, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s7, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s11, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s8, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s12, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s10, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s15, v31 +; SI-NEXT: v_readfirstlane_b32 s49, v7 +; SI-NEXT: v_readfirstlane_b32 s94, v6 +; SI-NEXT: v_readfirstlane_b32 s51, v5 +; SI-NEXT: v_readfirstlane_b32 s88, v4 +; SI-NEXT: v_readfirstlane_b32 s53, v3 +; SI-NEXT: v_readfirstlane_b32 s54, v2 +; SI-NEXT: v_readfirstlane_b32 s89, v1 +; SI-NEXT: v_readfirstlane_b32 s90, v0 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s91, v31 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s16, v32 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_readfirstlane_b32 s93, v33 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s52, v34 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s55, v35 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s79, v37 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:76 +; SI-NEXT: v_readfirstlane_b32 s50, v36 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s21, v38 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s56, v31 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s85, v32 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s58, v33 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s98, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s46, v48 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s99, v49 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_readfirstlane_b32 s97, v50 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_readfirstlane_b32 s9, v51 +; SI-NEXT: v_writelane_b32 v41, s58, 3 +; SI-NEXT: v_writelane_b32 v41, s9, 4 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v37 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_readfirstlane_b32 s78, v34 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_readfirstlane_b32 s69, v35 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s13, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s41, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s14, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s43, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s40, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s45, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s42, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s73, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s62, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s75, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s72, v31 -; SI-NEXT: s_cbranch_scc0 .LBB107_4 +; SI-NEXT: v_readfirstlane_b32 s30, v36 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB107_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_and_b32 s4, s92, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_or_b32 s12, s4, s5 ; SI-NEXT: s_and_b32 s4, s18, 0xff -; SI-NEXT: s_lshl_b32 s5, s19, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: s_lshl_b32 s5, s21, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_and_b32 s4, s22, 0xff -; SI-NEXT: s_lshl_b32 s5, s23, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_and_b32 s4, s24, 0xff -; SI-NEXT: s_lshl_b32 s5, s25, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_and_b32 s4, s26, 0xff -; SI-NEXT: s_lshl_b32 s5, s27, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_and_b32 s4, s28, 0xff -; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_and_b32 s4, s77, 0xff -; SI-NEXT: s_lshl_b32 s5, s76, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_and_b32 s4, s88, 0xff -; SI-NEXT: s_lshl_b32 s5, s78, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_and_b32 s4, s91, 0xff -; SI-NEXT: s_lshl_b32 s5, s89, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_and_b32 s4, s95, 0xff -; SI-NEXT: s_lshl_b32 s5, s92, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_and_b32 s4, s35, 0xff -; SI-NEXT: s_lshl_b32 s5, s30, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_and_b32 s4, s38, 0xff -; SI-NEXT: s_lshl_b32 s5, s36, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_and_b32 s4, s39, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s24, 0xff +; SI-NEXT: s_lshl_b32 s6, s25, 8 +; SI-NEXT: s_or_b32 s13, s5, s6 +; SI-NEXT: s_and_b32 s5, s26, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s27, 24 +; SI-NEXT: s_or_b32 s6, s6, s5 +; SI-NEXT: s_and_b32 s5, s54, 0xff +; SI-NEXT: s_lshl_b32 s7, s53, 8 +; SI-NEXT: s_or_b32 s14, s5, s7 +; SI-NEXT: s_and_b32 s5, s88, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s51, 24 +; SI-NEXT: s_or_b32 s8, s7, s5 +; SI-NEXT: s_and_b32 s5, s81, 0xff +; SI-NEXT: s_lshl_b32 s7, s71, 8 +; SI-NEXT: s_or_b32 s15, s5, s7 +; SI-NEXT: s_and_b32 s5, s34, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s67, 24 +; SI-NEXT: s_or_b32 s10, s7, s5 +; SI-NEXT: s_and_b32 s5, s80, 0xff +; SI-NEXT: s_lshl_b32 s7, s87, 8 +; SI-NEXT: s_or_b32 s40, s5, s7 +; SI-NEXT: s_and_b32 s5, s66, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s86, 24 +; SI-NEXT: s_or_b32 s60, s7, s5 +; SI-NEXT: s_and_b32 s5, s96, 0xff +; SI-NEXT: s_lshl_b32 s7, s44, 8 +; SI-NEXT: s_or_b32 s41, s5, s7 +; SI-NEXT: s_and_b32 s5, s9, 0xff +; SI-NEXT: s_lshl_b32 s7, s97, 8 +; SI-NEXT: s_or_b32 s42, s5, s7 +; SI-NEXT: s_and_b32 s5, s21, 0xff +; SI-NEXT: s_lshl_b32 s7, s79, 8 +; SI-NEXT: s_or_b32 s43, s5, s7 +; SI-NEXT: v_readlane_b32 s7, v41, 1 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: v_readlane_b32 s9, v41, 0 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s9, s9, 24 +; SI-NEXT: s_or_b32 s57, s9, s7 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s9, s29, 8 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s90, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s11, s89, 24 +; SI-NEXT: s_or_b32 s77, s11, s9 +; SI-NEXT: s_and_b32 s9, s94, 0xff +; SI-NEXT: s_lshl_b32 s11, s49, 8 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_and_b32 s11, s35, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: v_writelane_b32 v41, s44, 11 +; SI-NEXT: s_lshl_b32 s44, s37, 24 +; SI-NEXT: s_or_b32 vcc_lo, s44, s11 +; SI-NEXT: s_and_b32 s11, s38, 0xff +; SI-NEXT: s_lshl_b32 s44, s64, 8 +; SI-NEXT: s_or_b32 s11, s11, s44 +; SI-NEXT: s_and_b32 s44, s31, 0xff +; SI-NEXT: s_lshl_b32 s44, s44, 16 +; SI-NEXT: s_lshl_b32 s45, s36, 24 +; SI-NEXT: s_or_b32 vcc_hi, s45, s44 +; SI-NEXT: s_and_b32 s44, s65, 0xff +; SI-NEXT: s_lshl_b32 s45, s84, 8 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s68, 0xff +; SI-NEXT: s_lshl_b32 s45, s45, 16 +; SI-NEXT: s_mov_b32 s23, s21 +; SI-NEXT: s_mov_b32 s21, s46 +; SI-NEXT: s_lshl_b32 s46, s70, 24 +; SI-NEXT: s_and_b32 s44, s44, 0xffff +; SI-NEXT: v_writelane_b32 v41, s97, 12 +; SI-NEXT: s_mov_b32 s97, s86 +; SI-NEXT: s_mov_b32 s86, s84 +; SI-NEXT: s_mov_b32 s84, s70 +; SI-NEXT: s_mov_b32 s70, s34 +; SI-NEXT: s_mov_b32 s34, s88 +; SI-NEXT: s_mov_b32 s88, s24 +; SI-NEXT: s_or_b32 s24, s46, s45 +; SI-NEXT: s_or_b32 s61, s44, s24 +; SI-NEXT: s_and_b32 s44, s82, 0xff +; SI-NEXT: s_lshl_b32 s45, s30, 8 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s69, 0xff +; SI-NEXT: s_lshl_b32 s45, s45, 16 +; SI-NEXT: s_lshl_b32 s46, s78, 24 +; SI-NEXT: s_mov_b32 s95, s90 +; SI-NEXT: s_mov_b32 s90, s18 +; SI-NEXT: s_or_b32 s18, s46, s45 +; SI-NEXT: s_and_b32 s45, s83, 0xff +; SI-NEXT: s_lshl_b32 s45, s45, 16 +; SI-NEXT: s_lshl_b32 s46, s47, 24 +; SI-NEXT: s_and_b32 s44, s44, 0xffff +; SI-NEXT: s_or_b32 s62, s46, s45 +; SI-NEXT: s_or_b32 s63, s44, s18 +; SI-NEXT: s_and_b32 s44, s98, 0xff +; SI-NEXT: s_lshl_b32 s45, s58, 8 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s85, 0xff +; SI-NEXT: s_lshl_b32 s45, s45, 16 +; SI-NEXT: s_lshl_b32 s46, s56, 24 +; SI-NEXT: s_mov_b32 s76, s56 +; SI-NEXT: s_mov_b32 s56, s85 +; SI-NEXT: s_mov_b32 s85, s79 +; SI-NEXT: s_mov_b32 s79, s19 +; SI-NEXT: s_or_b32 s19, s46, s45 +; SI-NEXT: s_and_b32 s45, s99, 0xff +; SI-NEXT: s_lshl_b32 s45, s45, 16 +; SI-NEXT: s_lshl_b32 s46, s21, 24 +; SI-NEXT: s_and_b32 s44, s44, 0xffff +; SI-NEXT: s_or_b32 s72, s46, s45 +; SI-NEXT: s_or_b32 s73, s44, s19 +; SI-NEXT: s_and_b32 s44, s52, 0xff +; SI-NEXT: s_lshl_b32 s45, s93, 8 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s16, 0xff +; SI-NEXT: s_lshl_b32 s45, s45, 16 +; SI-NEXT: s_lshl_b32 s46, s91, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_mov_b32 s47, s96 +; SI-NEXT: s_mov_b32 s96, s78 +; SI-NEXT: s_mov_b32 s78, s69 +; SI-NEXT: s_mov_b32 s69, s68 +; SI-NEXT: s_mov_b32 s68, s38 +; SI-NEXT: s_mov_b32 s38, s35 +; SI-NEXT: s_mov_b32 s35, s89 +; SI-NEXT: s_or_b32 s89, s46, s45 +; SI-NEXT: s_and_b32 s45, s50, 0xff +; SI-NEXT: s_or_b32 s5, s5, s57 +; SI-NEXT: s_lshl_b32 s45, s45, 16 +; SI-NEXT: s_lshl_b32 s46, s55, 24 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_or_b32 s74, s46, s45 +; SI-NEXT: s_mov_b32 s45, s83 +; SI-NEXT: s_mov_b32 s83, s91 +; SI-NEXT: s_mov_b32 s91, s28 +; SI-NEXT: s_and_b32 s28, s42, 0xffff +; SI-NEXT: s_mov_b32 s59, s94 +; SI-NEXT: s_mov_b32 s94, s27 +; SI-NEXT: s_and_b32 s27, s43, 0xffff +; SI-NEXT: s_or_b32 s42, s12, s4 +; SI-NEXT: s_mov_b32 s43, s5 +; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 +; SI-NEXT: s_or_b32 s9, s9, vcc_lo +; SI-NEXT: v_writelane_b32 v41, s4, 5 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: v_writelane_b32 v41, s5, 6 +; SI-NEXT: s_lshr_b64 s[4:5], s[8:9], 16 +; SI-NEXT: s_or_b32 s11, s11, vcc_hi +; SI-NEXT: v_writelane_b32 v41, s4, 7 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_and_b32 s44, s44, 0xffff +; SI-NEXT: v_writelane_b32 v41, s5, 8 +; SI-NEXT: s_lshr_b64 s[4:5], s[10:11], 16 +; SI-NEXT: s_or_b32 s7, s7, s77 +; SI-NEXT: s_or_b32 s75, s44, s89 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_and_b32 s58, s15, 0xffff +; SI-NEXT: s_mov_b32 s44, s82 +; SI-NEXT: s_mov_b32 s82, s81 +; SI-NEXT: s_mov_b32 s81, s55 +; SI-NEXT: s_mov_b32 s55, s54 +; SI-NEXT: s_mov_b32 s54, s51 +; SI-NEXT: s_mov_b32 s51, s37 +; SI-NEXT: s_mov_b32 s37, s16 +; SI-NEXT: s_and_b32 s16, s40, 0xffff +; SI-NEXT: s_mov_b32 s46, s98 +; SI-NEXT: s_mov_b32 s98, s93 +; SI-NEXT: s_and_b32 s93, s41, 0xffff +; SI-NEXT: v_writelane_b32 v41, s4, 9 +; SI-NEXT: s_mov_b32 s39, s49 +; SI-NEXT: s_or_b32 s40, s13, s6 +; SI-NEXT: s_mov_b32 s41, s7 +; SI-NEXT: s_lshr_b64 s[48:49], s[6:7], 16 +; SI-NEXT: s_or_b32 s14, s14, s8 +; SI-NEXT: s_mov_b32 s15, s9 +; SI-NEXT: s_or_b32 s12, s58, s10 +; SI-NEXT: s_mov_b32 s13, s11 +; SI-NEXT: v_writelane_b32 v41, s5, 10 +; SI-NEXT: s_or_b32 s10, s16, s60 +; SI-NEXT: s_mov_b32 s11, s61 +; SI-NEXT: s_lshr_b64 s[60:61], s[60:61], 16 +; SI-NEXT: s_or_b32 s8, s93, s62 +; SI-NEXT: s_mov_b32 s9, s63 +; SI-NEXT: s_lshr_b64 s[62:63], s[62:63], 16 +; SI-NEXT: s_or_b32 s6, s28, s72 +; SI-NEXT: s_mov_b32 s7, s73 +; SI-NEXT: s_lshr_b64 s[72:73], s[72:73], 16 +; SI-NEXT: s_or_b32 s4, s27, s74 +; SI-NEXT: s_mov_b32 s5, s75 +; SI-NEXT: s_lshr_b64 s[74:75], s[74:75], 16 +; SI-NEXT: s_mov_b32 s16, s37 +; SI-NEXT: s_mov_b32 s37, s51 +; SI-NEXT: s_mov_b32 s51, s54 +; SI-NEXT: s_mov_b32 s54, s55 +; SI-NEXT: s_mov_b32 s55, s81 +; SI-NEXT: s_mov_b32 s81, s82 +; SI-NEXT: s_mov_b32 s82, s44 +; SI-NEXT: v_readlane_b32 s44, v41, 11 +; SI-NEXT: s_mov_b32 s93, s98 +; SI-NEXT: s_mov_b32 s98, s46 +; SI-NEXT: s_mov_b32 s46, s21 +; SI-NEXT: s_mov_b32 s21, s23 +; SI-NEXT: s_mov_b32 s28, s91 +; SI-NEXT: s_mov_b32 s91, s83 +; SI-NEXT: s_mov_b32 s83, s45 +; SI-NEXT: s_mov_b32 s27, s94 +; SI-NEXT: s_mov_b32 s94, s59 +; SI-NEXT: s_lshr_b32 s23, s57, 16 +; SI-NEXT: s_lshr_b32 s57, s77, 16 +; SI-NEXT: s_lshr_b32 s59, vcc_lo, 16 +; SI-NEXT: s_lshr_b32 s61, vcc_hi, 16 +; SI-NEXT: s_lshr_b32 s63, s24, 16 +; SI-NEXT: s_mov_b32 s24, s88 +; SI-NEXT: s_mov_b32 s88, s34 +; SI-NEXT: s_mov_b32 s34, s70 +; SI-NEXT: s_mov_b32 s70, s84 +; SI-NEXT: s_mov_b32 s84, s86 +; SI-NEXT: s_mov_b32 s86, s97 +; SI-NEXT: v_readlane_b32 s97, v41, 12 +; SI-NEXT: s_lshr_b32 s73, s18, 16 +; SI-NEXT: s_mov_b32 s18, s90 +; SI-NEXT: s_mov_b32 s90, s95 +; SI-NEXT: s_mov_b32 s49, s39 +; SI-NEXT: s_lshr_b32 s75, s19, 16 +; SI-NEXT: s_mov_b32 s19, s79 +; SI-NEXT: s_mov_b32 s79, s85 +; SI-NEXT: s_mov_b32 s85, s56 +; SI-NEXT: s_mov_b32 s56, s76 +; SI-NEXT: s_lshr_b32 s45, s89, 16 +; SI-NEXT: s_mov_b32 s89, s35 +; SI-NEXT: s_mov_b32 s35, s38 +; SI-NEXT: s_mov_b32 s38, s68 +; SI-NEXT: s_mov_b32 s68, s69 +; SI-NEXT: s_mov_b32 s69, s78 +; SI-NEXT: s_mov_b32 s78, s96 +; SI-NEXT: s_mov_b32 s96, s47 +; SI-NEXT: s_mov_b64 s[76:77], 0 +; SI-NEXT: s_branch .LBB107_3 +; SI-NEXT: .LBB107_2: +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: s_mov_b64 s[76:77], -1 +; SI-NEXT: v_writelane_b32 v41, s4, 5 +; SI-NEXT: v_writelane_b32 v41, s5, 6 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: v_writelane_b32 v41, s4, 7 +; SI-NEXT: v_writelane_b32 v41, s5, 8 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s4, 9 +; SI-NEXT: v_writelane_b32 v41, s5, 10 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: .LBB107_3: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[76:77] +; SI-NEXT: v_readlane_b32 s76, v41, 5 +; SI-NEXT: v_readlane_b32 s77, v41, 6 +; SI-NEXT: s_mov_b32 s58, s76 +; SI-NEXT: v_readlane_b32 s76, v41, 7 +; SI-NEXT: v_readlane_b32 s77, v41, 8 +; SI-NEXT: s_cbranch_vccnz .LBB107_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_and_b32 s4, s21, 0xff ; SI-NEXT: s_lshl_b32 s5, s79, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_and_b32 s4, s93, 0xff -; SI-NEXT: s_lshl_b32 s5, s90, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_and_b32 s4, s31, 0xff -; SI-NEXT: s_lshl_b32 s5, s94, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_and_b32 s4, s37, 0xff -; SI-NEXT: s_lshl_b32 s5, s34, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_and_b32 s4, s46, 0xff -; SI-NEXT: s_lshl_b32 s5, s44, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_and_b32 s4, s56, 0xff -; SI-NEXT: s_lshl_b32 s5, s47, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_and_b32 s4, s58, 0xff -; SI-NEXT: s_lshl_b32 s5, s57, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_and_b32 s4, s60, 0xff -; SI-NEXT: s_lshl_b32 s5, s59, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_and_b32 s4, s63, 0xff -; SI-NEXT: s_lshl_b32 s5, s61, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_and_b32 s4, s74, 0xff -; SI-NEXT: s_lshl_b32 s5, s72, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_and_b32 s4, s75, 0xff -; SI-NEXT: s_lshl_b32 s5, s62, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 -; SI-NEXT: s_and_b32 s4, s73, 0xff -; SI-NEXT: s_lshl_b32 s5, s42, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_and_b32 s4, s45, 0xff -; SI-NEXT: s_lshl_b32 s5, s40, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_and_b32 s4, s43, 0xff -; SI-NEXT: s_lshl_b32 s5, s14, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 -; SI-NEXT: s_and_b32 s4, s41, 0xff -; SI-NEXT: s_lshl_b32 s5, s13, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 -; SI-NEXT: s_and_b32 s4, s15, 0xff -; SI-NEXT: s_lshl_b32 s5, s10, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: s_and_b32 s4, s12, 0xff -; SI-NEXT: s_lshl_b32 s5, s8, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 -; SI-NEXT: s_and_b32 s4, s11, 0xff -; SI-NEXT: s_lshl_b32 s5, s7, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 -; SI-NEXT: s_and_b32 s4, s9, 0xff -; SI-NEXT: s_lshl_b32 s5, s6, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: s_cbranch_execnz .LBB107_3 -; SI-NEXT: .LBB107_2: ; %cmp.true -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_and_b32 s4, s9, 0xff -; SI-NEXT: s_lshl_b32 s5, s6, 8 -; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s50, s50, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s11, 0xff -; SI-NEXT: s_lshl_b32 s6, s7, 8 -; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_and_b32 s6, s50, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s55, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s39, s52, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s39, 0xff +; SI-NEXT: s_lshl_b32 s6, s93, 8 +; SI-NEXT: s_add_i32 s79, s16, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s12, 0xff -; SI-NEXT: s_lshl_b32 s7, s8, 8 -; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_and_b32 s7, s79, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s91, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_readlane_b32 s6, v41, 4 +; SI-NEXT: s_add_i32 s23, s6, 3 +; SI-NEXT: s_and_b32 s6, s23, 0xff +; SI-NEXT: s_lshl_b32 s7, s97, 8 +; SI-NEXT: s_add_i32 s99, s99, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s15, 0xff -; SI-NEXT: s_lshl_b32 s8, s10, 8 -; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_and_b32 s8, s99, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s46, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_add_i32 s98, s98, 3 +; SI-NEXT: v_readlane_b32 s8, v41, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s98, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: s_add_i32 s85, s85, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s9, s85, 0xff +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: s_lshl_b32 s8, s56, 24 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_add_i32 s96, s96, 3 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s41, 0xff -; SI-NEXT: s_lshl_b32 s9, s13, 8 -; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: s_and_b32 s8, s96, 0xff +; SI-NEXT: s_lshl_b32 s9, s44, 8 +; SI-NEXT: s_add_i32 s83, s83, 3 ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s43, 0xff -; SI-NEXT: s_lshl_b32 s10, s14, 8 -; SI-NEXT: s_add_i32 s45, s45, 3 +; SI-NEXT: v_readlane_b32 s9, v41, 2 +; SI-NEXT: s_and_b32 s10, s83, 0xff +; SI-NEXT: s_addk_i32 s8, 0x300 +; SI-NEXT: s_lshl_b32 s9, s9, 24 +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_add_i32 s82, s82, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s82, 0xff +; SI-NEXT: s_lshl_b32 s10, s30, 8 +; SI-NEXT: s_add_i32 s69, s69, 3 ; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s45, 0xff -; SI-NEXT: s_lshl_b32 s11, s40, 8 -; SI-NEXT: s_add_i32 s73, s73, 3 +; SI-NEXT: s_and_b32 s11, s69, 0xff +; SI-NEXT: s_addk_i32 s9, 0x300 +; SI-NEXT: s_lshl_b32 s10, s78, 24 +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_add_i32 s80, s80, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s80, 0xff +; SI-NEXT: s_lshl_b32 s11, s87, 8 +; SI-NEXT: s_add_i32 s66, s66, 3 ; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: s_and_b32 s11, s73, 0xff -; SI-NEXT: s_lshl_b32 s12, s42, 8 -; SI-NEXT: s_add_i32 s75, s75, 3 +; SI-NEXT: s_and_b32 s12, s66, 0xff +; SI-NEXT: s_addk_i32 s10, 0x300 +; SI-NEXT: s_lshl_b32 s11, s86, 24 +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_add_i32 s65, s65, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s65, 0xff +; SI-NEXT: s_lshl_b32 s12, s84, 8 +; SI-NEXT: s_add_i32 s52, s68, 3 ; SI-NEXT: s_or_b32 s11, s12, s11 -; SI-NEXT: s_and_b32 s12, s75, 0xff -; SI-NEXT: s_lshl_b32 s13, s62, 8 -; SI-NEXT: s_add_i32 s74, s74, 3 +; SI-NEXT: s_and_b32 s13, s52, 0xff +; SI-NEXT: s_addk_i32 s11, 0x300 +; SI-NEXT: s_lshl_b32 s12, s70, 24 +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_add_i32 s55, s81, 3 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s12, s55, 0xff +; SI-NEXT: s_lshl_b32 s13, s71, 8 +; SI-NEXT: s_add_i32 s48, s34, 3 ; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: s_and_b32 s13, s74, 0xff -; SI-NEXT: s_lshl_b32 s14, s72, 8 -; SI-NEXT: s_add_i32 s63, s63, 3 +; SI-NEXT: s_and_b32 s14, s48, 0xff +; SI-NEXT: s_addk_i32 s12, 0x300 +; SI-NEXT: s_lshl_b32 s13, s67, 24 +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_add_i32 s38, s38, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s38, 0xff +; SI-NEXT: s_lshl_b32 s14, s64, 8 +; SI-NEXT: s_add_i32 s31, s31, 3 ; SI-NEXT: s_or_b32 s13, s14, s13 -; SI-NEXT: s_and_b32 s14, s63, 0xff -; SI-NEXT: s_lshl_b32 s15, s61, 8 -; SI-NEXT: s_add_i32 s60, s60, 3 +; SI-NEXT: s_and_b32 s15, s31, 0xff +; SI-NEXT: s_addk_i32 s13, 0x300 +; SI-NEXT: s_lshl_b32 s14, s36, 24 +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_add_i32 s36, s54, 3 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_and_b32 s14, s36, 0xff +; SI-NEXT: s_lshl_b32 s15, s53, 8 +; SI-NEXT: s_add_i32 s95, s88, 3 ; SI-NEXT: s_or_b32 s14, s15, s14 -; SI-NEXT: s_and_b32 s15, s60, 0xff -; SI-NEXT: s_lshl_b32 s40, s59, 8 -; SI-NEXT: s_add_i32 s58, s58, 3 -; SI-NEXT: s_or_b32 s15, s40, s15 -; SI-NEXT: s_and_b32 s40, s58, 0xff -; SI-NEXT: s_lshl_b32 s41, s57, 8 -; SI-NEXT: s_add_i32 s56, s56, 3 -; SI-NEXT: s_or_b32 s40, s41, s40 -; SI-NEXT: s_and_b32 s41, s56, 0xff -; SI-NEXT: s_lshl_b32 s42, s47, 8 -; SI-NEXT: s_add_i32 s46, s46, 3 -; SI-NEXT: s_or_b32 s41, s42, s41 -; SI-NEXT: s_and_b32 s42, s46, 0xff -; SI-NEXT: s_lshl_b32 s43, s44, 8 -; SI-NEXT: s_add_i32 s37, s37, 3 -; SI-NEXT: s_or_b32 s42, s43, s42 -; SI-NEXT: s_and_b32 s43, s37, 0xff -; SI-NEXT: s_lshl_b32 s44, s34, 8 -; SI-NEXT: s_add_i32 s31, s31, 3 -; SI-NEXT: s_or_b32 s43, s44, s43 -; SI-NEXT: s_and_b32 s44, s31, 0xff -; SI-NEXT: s_lshl_b32 s45, s94, 8 -; SI-NEXT: s_add_i32 s93, s93, 3 -; SI-NEXT: s_or_b32 s44, s45, s44 -; SI-NEXT: s_and_b32 s45, s93, 0xff -; SI-NEXT: s_lshl_b32 s46, s90, 8 -; SI-NEXT: s_add_i32 s39, s39, 3 -; SI-NEXT: s_or_b32 s45, s46, s45 -; SI-NEXT: s_and_b32 s46, s39, 0xff -; SI-NEXT: s_lshl_b32 s47, s79, 8 -; SI-NEXT: s_add_i32 s38, s38, 3 -; SI-NEXT: s_or_b32 s46, s47, s46 -; SI-NEXT: s_and_b32 s47, s38, 0xff -; SI-NEXT: s_lshl_b32 s56, s36, 8 -; SI-NEXT: s_add_i32 s35, s35, 3 -; SI-NEXT: s_or_b32 s47, s56, s47 -; SI-NEXT: s_and_b32 s56, s35, 0xff -; SI-NEXT: s_lshl_b32 s57, s30, 8 -; SI-NEXT: s_add_i32 s95, s95, 3 -; SI-NEXT: s_or_b32 s56, s57, s56 -; SI-NEXT: s_and_b32 s57, s95, 0xff -; SI-NEXT: s_lshl_b32 s58, s92, 8 -; SI-NEXT: s_add_i32 s91, s91, 3 -; SI-NEXT: s_or_b32 s57, s58, s57 -; SI-NEXT: s_and_b32 s58, s91, 0xff -; SI-NEXT: s_lshl_b32 s59, s89, 8 -; SI-NEXT: s_add_i32 s88, s88, 3 -; SI-NEXT: s_or_b32 s58, s59, s58 -; SI-NEXT: s_and_b32 s59, s88, 0xff -; SI-NEXT: s_lshl_b32 s60, s78, 8 -; SI-NEXT: s_add_i32 s77, s77, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_and_b32 s21, s95, 0xff +; SI-NEXT: s_addk_i32 s14, 0x300 +; SI-NEXT: s_lshl_b32 s15, s51, 24 +; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_or_b32 s15, s15, s21 +; SI-NEXT: s_add_i32 s94, s94, 3 +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: s_and_b32 s15, s94, 0xff +; SI-NEXT: s_lshl_b32 s21, s49, 8 +; SI-NEXT: s_add_i32 s91, s35, 3 +; SI-NEXT: s_or_b32 s15, s21, s15 +; SI-NEXT: s_and_b32 s16, s91, 0xff +; SI-NEXT: s_addk_i32 s15, 0x300 +; SI-NEXT: s_lshl_b32 s21, s37, 24 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_or_b32 s21, s21, s16 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s15, s21, s15 +; SI-NEXT: s_and_b32 s21, s24, 0xff +; SI-NEXT: s_lshl_b32 s16, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s21, s16, s21 +; SI-NEXT: s_and_b32 s23, s26, 0xff +; SI-NEXT: s_addk_i32 s21, 0x300 +; SI-NEXT: s_lshl_b32 s16, s27, 24 +; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s23 +; SI-NEXT: s_or_b32 s21, s16, s21 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s40, s21, 0x3000000 +; SI-NEXT: s_and_b32 s21, s28, 0xff +; SI-NEXT: s_lshl_b32 s16, s29, 8 +; SI-NEXT: s_lshl_b32 s23, s89, 24 +; SI-NEXT: s_add_i32 s89, s90, 3 +; SI-NEXT: s_or_b32 s21, s16, s21 +; SI-NEXT: s_and_b32 s16, s89, 0xff +; SI-NEXT: s_addk_i32 s21, 0x300 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_or_b32 s16, s23, s16 +; SI-NEXT: s_or_b32 s16, s16, s21 +; SI-NEXT: s_add_i32 s41, s16, 0x3000000 +; SI-NEXT: s_add_i32 s16, s92, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_or_b32 s59, s60, s59 -; SI-NEXT: s_and_b32 s60, s77, 0xff -; SI-NEXT: s_lshl_b32 s61, s76, 8 -; SI-NEXT: s_and_b32 s28, s28, 0xff -; SI-NEXT: s_lshl_b32 s29, s29, 8 -; SI-NEXT: s_and_b32 s26, s26, 0xff -; SI-NEXT: s_lshl_b32 s27, s27, 8 -; SI-NEXT: s_and_b32 s24, s24, 0xff -; SI-NEXT: s_lshl_b32 s25, s25, 8 -; SI-NEXT: s_and_b32 s22, s22, 0xff -; SI-NEXT: s_lshl_b32 s23, s23, 8 -; SI-NEXT: s_and_b32 s20, s20, 0xff -; SI-NEXT: s_lshl_b32 s21, s21, 8 +; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 8 +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s19, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s42, s16, 0x3000000 +; SI-NEXT: s_add_i32 s16, s20, 3 +; SI-NEXT: v_readlane_b32 s17, v41, 1 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: s_or_b32 s60, s61, s60 -; SI-NEXT: s_or_b32 s28, s29, s28 -; SI-NEXT: s_or_b32 s26, s27, s26 -; SI-NEXT: s_or_b32 s24, s25, s24 -; SI-NEXT: s_or_b32 s22, s23, s22 -; SI-NEXT: s_or_b32 s20, s21, s20 -; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: s_add_i32 s18, s22, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_addk_i32 s5, 0x300 -; SI-NEXT: s_addk_i32 s6, 0x300 -; SI-NEXT: s_addk_i32 s7, 0x300 -; SI-NEXT: s_addk_i32 s8, 0x300 -; SI-NEXT: s_addk_i32 s9, 0x300 -; SI-NEXT: s_addk_i32 s10, 0x300 -; SI-NEXT: s_addk_i32 s11, 0x300 -; SI-NEXT: s_addk_i32 s12, 0x300 -; SI-NEXT: s_addk_i32 s13, 0x300 -; SI-NEXT: s_addk_i32 s14, 0x300 -; SI-NEXT: s_addk_i32 s15, 0x300 -; SI-NEXT: s_addk_i32 s40, 0x300 -; SI-NEXT: s_addk_i32 s41, 0x300 -; SI-NEXT: s_addk_i32 s42, 0x300 -; SI-NEXT: s_addk_i32 s43, 0x300 -; SI-NEXT: s_addk_i32 s44, 0x300 -; SI-NEXT: s_addk_i32 s45, 0x300 -; SI-NEXT: s_addk_i32 s46, 0x300 -; SI-NEXT: s_addk_i32 s47, 0x300 -; SI-NEXT: s_addk_i32 s56, 0x300 -; SI-NEXT: s_addk_i32 s57, 0x300 -; SI-NEXT: s_addk_i32 s58, 0x300 -; SI-NEXT: s_addk_i32 s59, 0x300 -; SI-NEXT: s_addk_i32 s60, 0x300 -; SI-NEXT: s_addk_i32 s28, 0x300 -; SI-NEXT: s_addk_i32 s26, 0x300 -; SI-NEXT: s_addk_i32 s24, 0x300 -; SI-NEXT: s_addk_i32 s22, 0x300 -; SI-NEXT: s_addk_i32 s20, 0x300 -; SI-NEXT: s_addk_i32 s18, 0x300 +; SI-NEXT: v_readlane_b32 s17, v41, 0 +; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: .LBB107_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v9 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v13 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v14 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v17 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v18 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v21 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v22 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v25 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v26 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v30 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v29 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_readlane_b32 s39, v32, 7 -; SI-NEXT: v_readlane_b32 s38, v32, 6 -; SI-NEXT: v_readlane_b32 s37, v32, 5 -; SI-NEXT: v_readlane_b32 s36, v32, 4 -; SI-NEXT: v_readlane_b32 s35, v32, 3 -; SI-NEXT: v_readlane_b32 s34, v32, 2 -; SI-NEXT: v_readlane_b32 s31, v32, 1 -; SI-NEXT: v_readlane_b32 s30, v32, 0 -; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_add_i32 s7, s7, 0x3000000 +; SI-NEXT: s_add_i32 s8, s8, 0x3000000 +; SI-NEXT: s_add_i32 s9, s9, 0x3000000 +; SI-NEXT: s_add_i32 s10, s10, 0x3000000 +; SI-NEXT: s_add_i32 s11, s11, 0x3000000 +; SI-NEXT: s_add_i32 s12, s12, 0x3000000 +; SI-NEXT: s_add_i32 s13, s13, 0x3000000 +; SI-NEXT: s_add_i32 s43, s16, 0x3000000 +; SI-NEXT: s_add_i32 s14, s14, 0x3000000 +; SI-NEXT: s_add_i32 s15, s15, 0x3000000 +; SI-NEXT: s_lshr_b64 s[58:59], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[16:17], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[48:49], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[14:15], 16 +; SI-NEXT: v_writelane_b32 v41, s16, 9 +; SI-NEXT: s_lshr_b32 s23, s43, 16 +; SI-NEXT: s_lshr_b32 s57, s41, 16 +; SI-NEXT: s_lshr_b32 s59, s15, 16 +; SI-NEXT: s_lshr_b32 s61, s13, 16 +; SI-NEXT: s_lshr_b32 s63, s11, 16 +; SI-NEXT: s_lshr_b32 s73, s9, 16 +; SI-NEXT: s_lshr_b32 s75, s7, 16 +; SI-NEXT: s_lshr_b32 s45, s5, 16 +; SI-NEXT: v_writelane_b32 v41, s17, 10 +; SI-NEXT: .LBB107_5: ; %end +; SI-NEXT: s_and_b32 s16, s42, 0xffff +; SI-NEXT: s_lshl_b32 s17, s58, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s43, 0xffff +; SI-NEXT: s_lshl_b32 s18, s23, 16 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_and_b32 s18, s40, 0xffff +; SI-NEXT: s_lshl_b32 s19, s48, 16 +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: s_and_b32 s19, s41, 0xffff +; SI-NEXT: s_lshl_b32 s20, s57, 16 +; SI-NEXT: s_or_b32 s19, s19, s20 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s20, s76, 16 +; SI-NEXT: s_or_b32 s14, s14, s20 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s20, s59, 16 +; SI-NEXT: s_or_b32 s15, s15, s20 +; SI-NEXT: v_readlane_b32 s20, v41, 9 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_or_b32 s12, s12, s20 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s20, s61, 16 +; SI-NEXT: s_or_b32 s13, s13, s20 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s20, s60, 16 +; SI-NEXT: s_or_b32 s10, s10, s20 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s20, s63, 16 +; SI-NEXT: s_or_b32 s11, s11, s20 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s20, s62, 16 +; SI-NEXT: s_or_b32 s8, s8, s20 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s20, s73, 16 +; SI-NEXT: s_or_b32 s9, s9, s20 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s20, s72, 16 +; SI-NEXT: s_or_b32 s6, s6, s20 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s20, s75, 16 +; SI-NEXT: s_or_b32 s7, s7, s20 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s20, s74, 16 +; SI-NEXT: s_or_b32 s4, s4, s20 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s20, s45, 16 +; SI-NEXT: s_or_b32 s5, s5, s20 +; SI-NEXT: v_readlane_b32 s21, v41, 10 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s14 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s12 +; SI-NEXT: v_mov_b32_e32 v7, s13 +; SI-NEXT: v_mov_b32_e32 v8, s10 +; SI-NEXT: v_mov_b32_e32 v9, s11 +; SI-NEXT: v_mov_b32_e32 v10, s8 +; SI-NEXT: v_mov_b32_e32 v11, s9 +; SI-NEXT: v_mov_b32_e32 v12, s6 +; SI-NEXT: v_mov_b32_e32 v13, s7 +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: v_mov_b32_e32 v15, s5 +; SI-NEXT: v_readlane_b32 s99, v40, 35 +; SI-NEXT: v_readlane_b32 s98, v40, 34 +; SI-NEXT: v_readlane_b32 s97, v40, 33 +; SI-NEXT: v_readlane_b32 s96, v40, 32 +; SI-NEXT: v_readlane_b32 s87, v40, 31 +; SI-NEXT: v_readlane_b32 s86, v40, 30 +; SI-NEXT: v_readlane_b32 s85, v40, 29 +; SI-NEXT: v_readlane_b32 s84, v40, 28 +; SI-NEXT: v_readlane_b32 s83, v40, 27 +; SI-NEXT: v_readlane_b32 s82, v40, 26 +; SI-NEXT: v_readlane_b32 s81, v40, 25 +; SI-NEXT: v_readlane_b32 s80, v40, 24 +; SI-NEXT: v_readlane_b32 s71, v40, 23 +; SI-NEXT: v_readlane_b32 s70, v40, 22 +; SI-NEXT: v_readlane_b32 s69, v40, 21 +; SI-NEXT: v_readlane_b32 s68, v40, 20 +; SI-NEXT: v_readlane_b32 s67, v40, 19 +; SI-NEXT: v_readlane_b32 s66, v40, 18 +; SI-NEXT: v_readlane_b32 s65, v40, 17 +; SI-NEXT: v_readlane_b32 s64, v40, 16 +; SI-NEXT: v_readlane_b32 s55, v40, 15 +; SI-NEXT: v_readlane_b32 s54, v40, 14 +; SI-NEXT: v_readlane_b32 s53, v40, 13 +; SI-NEXT: v_readlane_b32 s52, v40, 12 +; SI-NEXT: v_readlane_b32 s51, v40, 11 +; SI-NEXT: v_readlane_b32 s50, v40, 10 +; SI-NEXT: v_readlane_b32 s49, v40, 9 +; SI-NEXT: v_readlane_b32 s48, v40, 8 +; SI-NEXT: v_readlane_b32 s39, v40, 7 +; SI-NEXT: v_readlane_b32 s38, v40, 6 +; SI-NEXT: v_readlane_b32 s37, v40, 5 +; SI-NEXT: v_readlane_b32 s36, v40, 4 +; SI-NEXT: v_readlane_b32 s35, v40, 3 +; SI-NEXT: v_readlane_b32 s34, v40, 2 +; SI-NEXT: v_readlane_b32 s31, v40, 1 +; SI-NEXT: v_readlane_b32 s30, v40, 0 +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB107_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: s_branch .LBB107_2 ; ; VI-LABEL: bitcast_v64i8_to_v32f16_scalar: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll index 36caff3752e26..361a93919fed7 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll @@ -4722,286 +4722,142 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v41, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v43, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v45, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: v_alignbit_b32 v18, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v19, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v20, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v21, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v22, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v23, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v25, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v28, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v30, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 ; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_alignbit_b32 v18, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v19, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v20, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v21, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v22, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v23, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v25, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v28, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v30, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 ; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v42 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v40 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v4, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v54 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v52 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v51 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v48 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v39 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v35 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v36 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v31 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v32 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v27 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v28 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v22 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v24 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v21 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v30 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v35 +; SI-NEXT: v_or_b32_e32 v2, v2, v28 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v34 +; SI-NEXT: v_or_b32_e32 v4, v4, v25 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v33 +; SI-NEXT: v_or_b32_e32 v6, v6, v23 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v32 +; SI-NEXT: v_or_b32_e32 v8, v8, v22 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v31 +; SI-NEXT: v_or_b32_e32 v10, v10, v21 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: v_or_b32_e32 v12, v12, v20 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 +; SI-NEXT: v_or_b32_e32 v14, v14, v19 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v26 +; SI-NEXT: v_or_b32_e32 v16, v16, v18 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v30 +; SI-NEXT: v_or_b32_e32 v3, v3, v28 +; SI-NEXT: v_or_b32_e32 v5, v5, v25 +; SI-NEXT: v_or_b32_e32 v7, v7, v23 +; SI-NEXT: v_or_b32_e32 v9, v9, v22 +; SI-NEXT: v_or_b32_e32 v11, v11, v21 +; SI-NEXT: v_or_b32_e32 v13, v13, v20 +; SI-NEXT: v_or_b32_e32 v15, v15, v19 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18i32_to_v36f16: @@ -5410,85 +5266,47 @@ define inreg <36 x half> @bitcast_v18i32_to_v36f16_scalar(<18 x i32> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v17, s28 ; SI-NEXT: v_mov_b32_e32 v18, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: v_readfirstlane_b32 s23, v5 -; SI-NEXT: v_readfirstlane_b32 s22, v6 -; SI-NEXT: v_readfirstlane_b32 s21, v7 -; SI-NEXT: v_readfirstlane_b32 s20, v8 -; SI-NEXT: v_readfirstlane_b32 s19, v9 -; SI-NEXT: v_readfirstlane_b32 s18, v10 -; SI-NEXT: v_readfirstlane_b32 s17, v11 -; SI-NEXT: v_readfirstlane_b32 s16, v12 -; SI-NEXT: v_readfirstlane_b32 s15, v13 -; SI-NEXT: v_readfirstlane_b32 s14, v14 -; SI-NEXT: v_readfirstlane_b32 s13, v15 -; SI-NEXT: v_readfirstlane_b32 s12, v16 -; SI-NEXT: v_readfirstlane_b32 s11, v17 -; SI-NEXT: v_readfirstlane_b32 s10, v18 -; SI-NEXT: v_readfirstlane_b32 s8, v0 +; SI-NEXT: v_readfirstlane_b32 s20, v5 +; SI-NEXT: v_readfirstlane_b32 s21, v6 +; SI-NEXT: v_readfirstlane_b32 s18, v7 +; SI-NEXT: v_readfirstlane_b32 s19, v8 +; SI-NEXT: v_readfirstlane_b32 s16, v9 +; SI-NEXT: v_readfirstlane_b32 s17, v10 +; SI-NEXT: v_readfirstlane_b32 s14, v11 +; SI-NEXT: v_readfirstlane_b32 s15, v12 +; SI-NEXT: v_readfirstlane_b32 s12, v13 +; SI-NEXT: v_readfirstlane_b32 s13, v14 +; SI-NEXT: v_readfirstlane_b32 s10, v15 +; SI-NEXT: v_readfirstlane_b32 s11, v16 +; SI-NEXT: v_readfirstlane_b32 s8, v17 +; SI-NEXT: v_readfirstlane_b32 s9, v18 +; SI-NEXT: v_readfirstlane_b32 s6, v0 ; SI-NEXT: v_readfirstlane_b32 s7, v1 -; SI-NEXT: v_readfirstlane_b32 s6, v2 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v3 +; SI-NEXT: v_readfirstlane_b32 s4, v2 +; SI-NEXT: s_and_b64 s[22:23], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v3 ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s23 +; SI-NEXT: s_lshr_b32 s60, s5, 16 +; SI-NEXT: s_lshr_b32 s61, s7, 16 +; SI-NEXT: s_lshr_b32 s62, s9, 16 +; SI-NEXT: s_lshr_b32 s63, s11, 16 +; SI-NEXT: s_lshr_b32 s72, s13, 16 +; SI-NEXT: s_lshr_b32 s73, s15, 16 +; SI-NEXT: s_lshr_b32 s74, s17, 16 +; SI-NEXT: s_lshr_b32 s75, s19, 16 +; SI-NEXT: s_lshr_b32 s76, s21, 16 +; SI-NEXT: s_lshr_b64 s[22:23], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[20:21], 16 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 @@ -5501,175 +5319,123 @@ define inreg <36 x half> @bitcast_v18i32_to_v36f16_scalar(<18 x i32> inreg %a, i ; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: s_lshr_b32 s5, s22, 16 -; SI-NEXT: s_lshr_b32 s24, s21, 16 -; SI-NEXT: s_lshr_b32 s25, s20, 16 -; SI-NEXT: s_lshr_b32 s26, s19, 16 -; SI-NEXT: s_lshr_b32 s27, s18, 16 -; SI-NEXT: s_lshr_b32 s28, s17, 16 -; SI-NEXT: s_lshr_b32 s29, s16, 16 -; SI-NEXT: s_lshr_b32 s40, s15, 16 -; SI-NEXT: s_lshr_b32 s41, s14, 16 -; SI-NEXT: s_lshr_b32 s42, s13, 16 -; SI-NEXT: s_lshr_b32 s43, s12, 16 -; SI-NEXT: s_lshr_b32 s44, s11, 16 -; SI-NEXT: s_lshr_b32 s45, s10, 16 -; SI-NEXT: s_lshr_b32 s46, s8, 16 -; SI-NEXT: s_lshr_b32 s47, s7, 16 -; SI-NEXT: s_lshr_b32 s56, s6, 16 -; SI-NEXT: s_lshr_b32 s57, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 -; SI-NEXT: .LBB17_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_or_b32_e32 v2, v32, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v32 -; SI-NEXT: v_or_b32_e32 v5, v5, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v30 -; SI-NEXT: v_or_b32_e32 v7, v7, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v28 -; SI-NEXT: v_or_b32_e32 v9, v26, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 -; SI-NEXT: v_or_b32_e32 v11, v24, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v24 -; SI-NEXT: v_or_b32_e32 v13, v22, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v15, v20, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v20 -; SI-NEXT: v_or_b32_e32 v1, v36, v1 -; SI-NEXT: v_or_b32_e32 v3, v34, v3 -; SI-NEXT: v_or_b32_e32 v4, v31, v4 -; SI-NEXT: v_or_b32_e32 v6, v29, v6 -; SI-NEXT: v_or_b32_e32 v8, v27, v8 -; SI-NEXT: v_or_b32_e32 v10, v25, v10 -; SI-NEXT: v_or_b32_e32 v12, v23, v12 -; SI-NEXT: v_or_b32_e32 v14, v21, v14 -; SI-NEXT: v_or_b32_e32 v16, v19, v16 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: s_add_i32 s5, s5, 3 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: s_lshr_b64 s[22:23], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[20:21], 16 +; SI-NEXT: s_lshr_b32 s60, s5, 16 +; SI-NEXT: s_lshr_b32 s61, s7, 16 +; SI-NEXT: s_lshr_b32 s62, s9, 16 +; SI-NEXT: s_lshr_b32 s63, s11, 16 +; SI-NEXT: s_lshr_b32 s72, s13, 16 +; SI-NEXT: s_lshr_b32 s73, s15, 16 +; SI-NEXT: s_lshr_b32 s74, s17, 16 +; SI-NEXT: s_lshr_b32 s75, s19, 16 +; SI-NEXT: s_lshr_b32 s76, s21, 16 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: s_lshl_b32 s23, s56, 16 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_or_b32 s20, s20, s23 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s23, s76, 16 +; SI-NEXT: s_or_b32 s21, s21, s23 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s23, s46, 16 +; SI-NEXT: s_or_b32 s18, s18, s23 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s23, s75, 16 +; SI-NEXT: s_or_b32 s19, s19, s23 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s23, s44, 16 +; SI-NEXT: s_or_b32 s16, s16, s23 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s23, s74, 16 +; SI-NEXT: s_or_b32 s17, s17, s23 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s23, s42, 16 +; SI-NEXT: s_or_b32 s14, s14, s23 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s23, s73, 16 +; SI-NEXT: s_or_b32 s15, s15, s23 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s23, s40, 16 +; SI-NEXT: s_or_b32 s12, s12, s23 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s23, s72, 16 +; SI-NEXT: s_or_b32 s13, s13, s23 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s23, s28, 16 +; SI-NEXT: s_or_b32 s10, s10, s23 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s23, s63, 16 +; SI-NEXT: s_or_b32 s11, s11, s23 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s23, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s23 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s23, s62, 16 +; SI-NEXT: s_or_b32 s9, s9, s23 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s23, s24, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_or_b32 s6, s6, s23 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s23, s61, 16 +; SI-NEXT: s_or_b32 s4, s4, s22 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s22, s60, 16 +; SI-NEXT: s_or_b32 s7, s7, s23 +; SI-NEXT: s_or_b32 s5, s5, s22 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_mov_b32_e32 v1, s21 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: v_mov_b32_e32 v5, s17 +; SI-NEXT: v_mov_b32_e32 v6, s14 +; SI-NEXT: v_mov_b32_e32 v7, s15 +; SI-NEXT: v_mov_b32_e32 v8, s12 +; SI-NEXT: v_mov_b32_e32 v9, s13 +; SI-NEXT: v_mov_b32_e32 v10, s10 +; SI-NEXT: v_mov_b32_e32 v11, s11 +; SI-NEXT: v_mov_b32_e32 v12, s8 +; SI-NEXT: v_mov_b32_e32 v13, s9 +; SI-NEXT: v_mov_b32_e32 v14, s6 +; SI-NEXT: v_mov_b32_e32 v15, s7 +; SI-NEXT: v_mov_b32_e32 v16, s4 +; SI-NEXT: v_mov_b32_e32 v17, s5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v18i32_to_v36f16_scalar: @@ -6152,68 +5918,7 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v36f16_to_v18i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_mov_b32_e32 v32, v17 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -6230,101 +5935,102 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 +; SI-NEXT: v_mov_b32_e32 v33, v16 +; SI-NEXT: v_mov_b32_e32 v41, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_mov_b32_e32 v34, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_mov_b32_e32 v35, v14 +; SI-NEXT: v_mov_b32_e32 v36, v13 +; SI-NEXT: v_mov_b32_e32 v37, v12 +; SI-NEXT: v_mov_b32_e32 v38, v11 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_mov_b32_e32 v48, v9 +; SI-NEXT: v_mov_b32_e32 v49, v8 +; SI-NEXT: v_mov_b32_e32 v50, v7 +; SI-NEXT: v_mov_b32_e32 v51, v6 +; SI-NEXT: v_mov_b32_e32 v52, v5 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v3 +; SI-NEXT: v_mov_b32_e32 v55, v2 +; SI-NEXT: v_mov_b32_e32 v40, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v41 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v37 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 -; SI-NEXT: v_or_b32_e32 v1, v32, v1 -; SI-NEXT: v_or_b32_e32 v2, v62, v2 -; SI-NEXT: v_or_b32_e32 v3, v60, v3 -; SI-NEXT: v_or_b32_e32 v4, v58, v4 -; SI-NEXT: v_or_b32_e32 v5, v56, v5 -; SI-NEXT: v_or_b32_e32 v6, v46, v6 -; SI-NEXT: v_or_b32_e32 v7, v44, v7 -; SI-NEXT: v_or_b32_e32 v8, v42, v8 -; SI-NEXT: v_or_b32_e32 v9, v40, v9 -; SI-NEXT: v_or_b32_e32 v10, v54, v10 -; SI-NEXT: v_or_b32_e32 v11, v52, v11 -; SI-NEXT: v_or_b32_e32 v12, v50, v12 -; SI-NEXT: v_or_b32_e32 v13, v48, v13 -; SI-NEXT: v_or_b32_e32 v14, v38, v14 -; SI-NEXT: v_or_b32_e32 v15, v36, v15 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v43 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v63 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v62 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v60 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v59 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v58 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v56 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v47 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v46 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v35 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 @@ -6339,20 +6045,44 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v34 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v33 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; kill: killed $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; kill: killed $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; kill: killed $vgpr18 @@ -6361,10 +6091,10 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB18_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -6373,14 +6103,15 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v53 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -6389,32 +6120,32 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v62 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v52 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v51 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v59 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v60 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 @@ -6423,10 +6154,10 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v48 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -6434,29 +6165,30 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v56 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v39 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 @@ -6464,10 +6196,10 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v46 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v36 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 @@ -6475,40 +6207,38 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v35 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v36 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v32 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v33 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 @@ -7003,160 +6733,106 @@ define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i ; SI-LABEL: bitcast_v36f16_to_v18i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s12 -; SI-NEXT: s_lshr_b32 s12, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s12 -; SI-NEXT: s_lshr_b32 s12, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s12 -; SI-NEXT: s_lshr_b32 s12, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s12 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v45, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v11 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 -; SI-NEXT: s_lshr_b32 s12, s17, 16 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s12 -; SI-NEXT: s_lshr_b32 s12, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v3 +; SI-NEXT: v_mov_b32_e32 v32, v3 +; SI-NEXT: v_mov_b32_e32 v33, v2 +; SI-NEXT: v_mov_b32_e32 v34, v1 +; SI-NEXT: v_mov_b32_e32 v35, v0 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v34 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v35 ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 -; SI-NEXT: v_or_b32_e32 v1, v33, v1 -; SI-NEXT: v_or_b32_e32 v2, v44, v2 -; SI-NEXT: v_or_b32_e32 v3, v42, v3 -; SI-NEXT: v_or_b32_e32 v4, v62, v4 -; SI-NEXT: v_or_b32_e32 v5, v59, v5 -; SI-NEXT: v_or_b32_e32 v6, v58, v6 -; SI-NEXT: v_or_b32_e32 v7, v47, v7 -; SI-NEXT: v_or_b32_e32 v8, v46, v8 -; SI-NEXT: v_or_b32_e32 v9, v37, v9 -; SI-NEXT: v_or_b32_e32 v10, v36, v10 -; SI-NEXT: v_or_b32_e32 v11, v29, v11 -; SI-NEXT: v_or_b32_e32 v12, v28, v12 -; SI-NEXT: v_or_b32_e32 v13, v26, v13 -; SI-NEXT: v_or_b32_e32 v14, v24, v14 -; SI-NEXT: v_or_b32_e32 v15, v22, v15 -; SI-NEXT: v_or_b32_e32 v16, v20, v16 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -7169,10 +6845,10 @@ define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -7181,199 +6857,126 @@ define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v59 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v37 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v38 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v31 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v25 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v30 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v35 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v34 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v33 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v32 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v27 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v26 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v24 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v22 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v39 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v38 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v37 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 ; SI-NEXT: .LBB19_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB19_4: -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v35, v42 -; SI-NEXT: v_mov_b32_e32 v42, v36 -; SI-NEXT: v_mov_b32_e32 v36, v19 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v34, v43 -; SI-NEXT: v_mov_b32_e32 v43, v37 -; SI-NEXT: v_mov_b32_e32 v37, v18 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v33, v44 -; SI-NEXT: v_mov_b32_e32 v44, v38 -; SI-NEXT: v_mov_b32_e32 v38, v20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v32, v45 -; SI-NEXT: v_mov_b32_e32 v45, v39 -; SI-NEXT: v_mov_b32_e32 v39, v21 -; SI-NEXT: v_mov_b32_e32 v48, v22 -; SI-NEXT: v_mov_b32_e32 v49, v23 -; SI-NEXT: v_mov_b32_e32 v50, v24 -; SI-NEXT: v_mov_b32_e32 v51, v25 -; SI-NEXT: v_mov_b32_e32 v52, v26 -; SI-NEXT: v_mov_b32_e32 v53, v27 -; SI-NEXT: v_mov_b32_e32 v54, v28 -; SI-NEXT: v_mov_b32_e32 v55, v29 -; SI-NEXT: v_mov_b32_e32 v40, v30 -; SI-NEXT: v_mov_b32_e32 v41, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v21, v39 -; SI-NEXT: v_mov_b32_e32 v39, v45 -; SI-NEXT: v_mov_b32_e32 v45, v32 -; SI-NEXT: v_mov_b32_e32 v20, v38 -; SI-NEXT: v_mov_b32_e32 v38, v44 -; SI-NEXT: v_mov_b32_e32 v44, v33 -; SI-NEXT: v_mov_b32_e32 v18, v37 -; SI-NEXT: v_mov_b32_e32 v37, v43 -; SI-NEXT: v_mov_b32_e32 v43, v34 -; SI-NEXT: v_mov_b32_e32 v19, v36 -; SI-NEXT: v_mov_b32_e32 v36, v42 -; SI-NEXT: v_mov_b32_e32 v42, v35 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v31, v41 -; SI-NEXT: v_mov_b32_e32 v30, v40 -; SI-NEXT: v_mov_b32_e32 v29, v55 -; SI-NEXT: v_mov_b32_e32 v28, v54 -; SI-NEXT: v_mov_b32_e32 v27, v53 -; SI-NEXT: v_mov_b32_e32 v26, v52 -; SI-NEXT: v_mov_b32_e32 v25, v51 -; SI-NEXT: v_mov_b32_e32 v24, v50 -; SI-NEXT: v_mov_b32_e32 v23, v49 -; SI-NEXT: v_mov_b32_e32 v22, v48 ; SI-NEXT: s_branch .LBB19_2 ; ; VI-LABEL: bitcast_v36f16_to_v18i32_scalar: @@ -11846,305 +11449,161 @@ define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v41, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v43, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v45, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: v_alignbit_b32 v18, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v19, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v20, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v21, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v22, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v23, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v25, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v28, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v30, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 ; SI-NEXT: .LBB32_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_alignbit_b32 v18, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v19, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v20, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v21, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v22, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v23, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v25, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v28, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v30, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 ; SI-NEXT: .LBB32_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v42 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v40 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v4, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v54 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v52 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v51 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v48 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v39 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v35 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v36 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v31 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v32 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v27 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v28 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v22 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v24 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v21 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v18f32_to_v36f16: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr28 -; VI-NEXT: ; implicit-def: $vgpr27 -; VI-NEXT: ; implicit-def: $vgpr26 -; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr24 -; VI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v30 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v35 +; SI-NEXT: v_or_b32_e32 v2, v2, v28 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v34 +; SI-NEXT: v_or_b32_e32 v4, v4, v25 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v33 +; SI-NEXT: v_or_b32_e32 v6, v6, v23 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v32 +; SI-NEXT: v_or_b32_e32 v8, v8, v22 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v31 +; SI-NEXT: v_or_b32_e32 v10, v10, v21 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: v_or_b32_e32 v12, v12, v20 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 +; SI-NEXT: v_or_b32_e32 v14, v14, v19 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v26 +; SI-NEXT: v_or_b32_e32 v16, v16, v18 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v30 +; SI-NEXT: v_or_b32_e32 v3, v3, v28 +; SI-NEXT: v_or_b32_e32 v5, v5, v25 +; SI-NEXT: v_or_b32_e32 v7, v7, v23 +; SI-NEXT: v_or_b32_e32 v9, v9, v22 +; SI-NEXT: v_or_b32_e32 v11, v11, v21 +; SI-NEXT: v_or_b32_e32 v13, v13, v20 +; SI-NEXT: v_or_b32_e32 v15, v15, v19 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v18f32_to_v36f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr23 ; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr20 @@ -12502,276 +11961,158 @@ define inreg <36 x half> @bitcast_v18f32_to_v36f16_scalar(<18 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v45, s16 -; SI-NEXT: v_mov_b32_e32 v44, s17 -; SI-NEXT: v_mov_b32_e32 v43, s18 -; SI-NEXT: v_mov_b32_e32 v42, s19 -; SI-NEXT: v_mov_b32_e32 v41, s20 -; SI-NEXT: v_mov_b32_e32 v40, s21 -; SI-NEXT: v_mov_b32_e32 v55, s22 -; SI-NEXT: v_mov_b32_e32 v54, s23 -; SI-NEXT: v_mov_b32_e32 v53, s24 -; SI-NEXT: v_mov_b32_e32 v52, s25 -; SI-NEXT: v_mov_b32_e32 v50, s26 -; SI-NEXT: v_mov_b32_e32 v49, s27 -; SI-NEXT: v_mov_b32_e32 v48, s28 +; SI-NEXT: v_mov_b32_e32 v18, s16 +; SI-NEXT: v_mov_b32_e32 v19, s17 +; SI-NEXT: v_mov_b32_e32 v16, s18 +; SI-NEXT: v_mov_b32_e32 v17, s19 +; SI-NEXT: v_mov_b32_e32 v14, s20 +; SI-NEXT: v_mov_b32_e32 v15, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v51, s29 +; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB33_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v45 +; SI-NEXT: v_lshr_b64 v[26:27], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[6:7], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; SI-NEXT: v_lshr_b64 v[24:25], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[4:5], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[20:21], v[18:19], 16 ; SI-NEXT: s_cbranch_execnz .LBB33_3 ; SI-NEXT: .LBB33_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v4, 1.0, v45 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v44 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v43 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v42 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v41 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v40 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v55 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v54 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v53 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v52 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v50 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v49 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v48 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v51 -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_lshr_b64 v[26:27], v[12:13], 16 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshr_b64 v[22:23], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[8:9], 16 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_lshr_b64 v[23:24], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[4:5], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[20:21], v[18:19], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v19 ; SI-NEXT: .LBB33_3: ; %end -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 -; SI-NEXT: v_or_b32_e32 v3, v32, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v32 -; SI-NEXT: v_or_b32_e32 v5, v5, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v30 -; SI-NEXT: v_or_b32_e32 v7, v7, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v28 -; SI-NEXT: v_or_b32_e32 v9, v26, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 -; SI-NEXT: v_or_b32_e32 v11, v24, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v24 -; SI-NEXT: v_or_b32_e32 v13, v22, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v22 -; SI-NEXT: v_or_b32_e32 v15, v20, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v20 -; SI-NEXT: v_or_b32_e32 v4, v31, v4 -; SI-NEXT: v_or_b32_e32 v6, v29, v6 -; SI-NEXT: v_or_b32_e32 v8, v27, v8 -; SI-NEXT: v_or_b32_e32 v10, v25, v10 -; SI-NEXT: v_or_b32_e32 v12, v23, v12 -; SI-NEXT: v_or_b32_e32 v14, v21, v14 -; SI-NEXT: v_or_b32_e32 v16, v19, v16 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v20, v18, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v39 +; SI-NEXT: v_or_b32_e32 v21, v5, v18 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v18, v16, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v38 +; SI-NEXT: v_or_b32_e32 v19, v5, v16 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v37 +; SI-NEXT: v_or_b32_e32 v5, v5, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v36 +; SI-NEXT: v_or_b32_e32 v7, v7, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v27 +; SI-NEXT: v_or_b32_e32 v8, v8, v14 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v35 +; SI-NEXT: v_or_b32_e32 v9, v9, v14 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v24 +; SI-NEXT: v_or_b32_e32 v10, v10, v14 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v26 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v23 +; SI-NEXT: v_or_b32_e32 v14, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v20 +; SI-NEXT: v_mov_b32_e32 v1, v21 +; SI-NEXT: v_mov_b32_e32 v2, v18 +; SI-NEXT: v_mov_b32_e32 v3, v19 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: s_branch .LBB33_2 ; ; VI-LABEL: bitcast_v18f32_to_v36f16_scalar: @@ -13354,68 +12695,7 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v36f16_to_v18f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_mov_b32_e32 v32, v17 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -13432,101 +12712,102 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 +; SI-NEXT: v_mov_b32_e32 v33, v16 +; SI-NEXT: v_mov_b32_e32 v41, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_mov_b32_e32 v34, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_mov_b32_e32 v35, v14 +; SI-NEXT: v_mov_b32_e32 v36, v13 +; SI-NEXT: v_mov_b32_e32 v37, v12 +; SI-NEXT: v_mov_b32_e32 v38, v11 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_mov_b32_e32 v48, v9 +; SI-NEXT: v_mov_b32_e32 v49, v8 +; SI-NEXT: v_mov_b32_e32 v50, v7 +; SI-NEXT: v_mov_b32_e32 v51, v6 +; SI-NEXT: v_mov_b32_e32 v52, v5 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v3 +; SI-NEXT: v_mov_b32_e32 v55, v2 +; SI-NEXT: v_mov_b32_e32 v40, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v41 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v37 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 -; SI-NEXT: v_or_b32_e32 v1, v32, v1 -; SI-NEXT: v_or_b32_e32 v2, v62, v2 -; SI-NEXT: v_or_b32_e32 v3, v60, v3 -; SI-NEXT: v_or_b32_e32 v4, v58, v4 -; SI-NEXT: v_or_b32_e32 v5, v56, v5 -; SI-NEXT: v_or_b32_e32 v6, v46, v6 -; SI-NEXT: v_or_b32_e32 v7, v44, v7 -; SI-NEXT: v_or_b32_e32 v8, v42, v8 -; SI-NEXT: v_or_b32_e32 v9, v40, v9 -; SI-NEXT: v_or_b32_e32 v10, v54, v10 -; SI-NEXT: v_or_b32_e32 v11, v52, v11 -; SI-NEXT: v_or_b32_e32 v12, v50, v12 -; SI-NEXT: v_or_b32_e32 v13, v48, v13 -; SI-NEXT: v_or_b32_e32 v14, v38, v14 -; SI-NEXT: v_or_b32_e32 v15, v36, v15 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v43 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v63 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v62 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v60 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v59 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v58 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v56 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v47 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v46 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v35 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 @@ -13541,20 +12822,44 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v34 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v33 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; kill: killed $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; kill: killed $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; kill: killed $vgpr18 @@ -13563,10 +12868,10 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB34_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -13575,14 +12880,15 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v53 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -13591,32 +12897,32 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v62 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v52 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v51 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v59 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v60 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 @@ -13625,10 +12931,10 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v48 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -13636,29 +12942,30 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v56 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v39 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 @@ -13666,10 +12973,10 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v46 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v36 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 @@ -13677,40 +12984,38 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v35 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v36 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v32 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v33 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 @@ -14205,160 +13510,106 @@ define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a, ; SI-LABEL: bitcast_v36f16_to_v18f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s12 -; SI-NEXT: s_lshr_b32 s12, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s12 -; SI-NEXT: s_lshr_b32 s12, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s12 -; SI-NEXT: s_lshr_b32 s12, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s12 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v45, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v11 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 -; SI-NEXT: s_lshr_b32 s12, s17, 16 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s12 -; SI-NEXT: s_lshr_b32 s12, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v3 +; SI-NEXT: v_mov_b32_e32 v32, v3 +; SI-NEXT: v_mov_b32_e32 v33, v2 +; SI-NEXT: v_mov_b32_e32 v34, v1 +; SI-NEXT: v_mov_b32_e32 v35, v0 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v34 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v35 ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 -; SI-NEXT: v_or_b32_e32 v1, v33, v1 -; SI-NEXT: v_or_b32_e32 v2, v44, v2 -; SI-NEXT: v_or_b32_e32 v3, v42, v3 -; SI-NEXT: v_or_b32_e32 v4, v62, v4 -; SI-NEXT: v_or_b32_e32 v5, v59, v5 -; SI-NEXT: v_or_b32_e32 v6, v58, v6 -; SI-NEXT: v_or_b32_e32 v7, v47, v7 -; SI-NEXT: v_or_b32_e32 v8, v46, v8 -; SI-NEXT: v_or_b32_e32 v9, v37, v9 -; SI-NEXT: v_or_b32_e32 v10, v36, v10 -; SI-NEXT: v_or_b32_e32 v11, v29, v11 -; SI-NEXT: v_or_b32_e32 v12, v28, v12 -; SI-NEXT: v_or_b32_e32 v13, v26, v13 -; SI-NEXT: v_or_b32_e32 v14, v24, v14 -; SI-NEXT: v_or_b32_e32 v15, v22, v15 -; SI-NEXT: v_or_b32_e32 v16, v20, v16 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -14371,10 +13622,10 @@ define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -14383,199 +13634,126 @@ define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v59 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v35 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v37 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v34 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v33 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v32 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v31 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v36 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v25 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v30 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v27 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v26 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v24 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v22 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v39 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v38 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v37 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 ; SI-NEXT: .LBB35_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB35_4: -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v35, v42 -; SI-NEXT: v_mov_b32_e32 v42, v36 -; SI-NEXT: v_mov_b32_e32 v36, v19 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v34, v43 -; SI-NEXT: v_mov_b32_e32 v43, v37 -; SI-NEXT: v_mov_b32_e32 v37, v18 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v33, v44 -; SI-NEXT: v_mov_b32_e32 v44, v38 -; SI-NEXT: v_mov_b32_e32 v38, v20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v32, v45 -; SI-NEXT: v_mov_b32_e32 v45, v39 -; SI-NEXT: v_mov_b32_e32 v39, v21 -; SI-NEXT: v_mov_b32_e32 v48, v22 -; SI-NEXT: v_mov_b32_e32 v49, v23 -; SI-NEXT: v_mov_b32_e32 v50, v24 -; SI-NEXT: v_mov_b32_e32 v51, v25 -; SI-NEXT: v_mov_b32_e32 v52, v26 -; SI-NEXT: v_mov_b32_e32 v53, v27 -; SI-NEXT: v_mov_b32_e32 v54, v28 -; SI-NEXT: v_mov_b32_e32 v55, v29 -; SI-NEXT: v_mov_b32_e32 v40, v30 -; SI-NEXT: v_mov_b32_e32 v41, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v21, v39 -; SI-NEXT: v_mov_b32_e32 v39, v45 -; SI-NEXT: v_mov_b32_e32 v45, v32 -; SI-NEXT: v_mov_b32_e32 v20, v38 -; SI-NEXT: v_mov_b32_e32 v38, v44 -; SI-NEXT: v_mov_b32_e32 v44, v33 -; SI-NEXT: v_mov_b32_e32 v18, v37 -; SI-NEXT: v_mov_b32_e32 v37, v43 -; SI-NEXT: v_mov_b32_e32 v43, v34 -; SI-NEXT: v_mov_b32_e32 v19, v36 -; SI-NEXT: v_mov_b32_e32 v36, v42 -; SI-NEXT: v_mov_b32_e32 v42, v35 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v31, v41 -; SI-NEXT: v_mov_b32_e32 v30, v40 -; SI-NEXT: v_mov_b32_e32 v29, v55 -; SI-NEXT: v_mov_b32_e32 v28, v54 -; SI-NEXT: v_mov_b32_e32 v27, v53 -; SI-NEXT: v_mov_b32_e32 v26, v52 -; SI-NEXT: v_mov_b32_e32 v25, v51 -; SI-NEXT: v_mov_b32_e32 v24, v50 -; SI-NEXT: v_mov_b32_e32 v23, v49 -; SI-NEXT: v_mov_b32_e32 v22, v48 ; SI-NEXT: s_branch .LBB35_2 ; ; VI-LABEL: bitcast_v36f16_to_v18f32_scalar: @@ -18207,127 +17385,46 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v41, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v43, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v45, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: v_alignbit_b32 v18, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v19, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v20, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v21, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v22, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v23, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v25, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v27, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v30, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 ; SI-NEXT: .LBB44_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_4 @@ -18350,143 +17447,80 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc ; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_alignbit_b32 v18, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v19, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v20, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v21, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v22, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v23, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v25, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v27, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v30, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 ; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v42 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v40 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v4, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v54 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v52 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v51 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v48 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v39 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v35 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v36 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v31 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v32 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v27 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v28 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v22 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v24 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v21 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v30 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v35 +; SI-NEXT: v_or_b32_e32 v2, v2, v27 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v34 +; SI-NEXT: v_or_b32_e32 v4, v4, v25 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v33 +; SI-NEXT: v_or_b32_e32 v6, v6, v23 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v32 +; SI-NEXT: v_or_b32_e32 v8, v8, v22 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v31 +; SI-NEXT: v_or_b32_e32 v10, v10, v21 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: v_or_b32_e32 v12, v12, v20 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; SI-NEXT: v_or_b32_e32 v14, v14, v19 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v26 +; SI-NEXT: v_or_b32_e32 v16, v16, v18 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v30 +; SI-NEXT: v_or_b32_e32 v3, v3, v27 +; SI-NEXT: v_or_b32_e32 v5, v5, v25 +; SI-NEXT: v_or_b32_e32 v7, v7, v23 +; SI-NEXT: v_or_b32_e32 v9, v9, v22 +; SI-NEXT: v_or_b32_e32 v11, v11, v21 +; SI-NEXT: v_or_b32_e32 v13, v13, v20 +; SI-NEXT: v_or_b32_e32 v15, v15, v19 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9i64_to_v36f16: @@ -18905,266 +17939,176 @@ define inreg <36 x half> @bitcast_v9i64_to_v36f16_scalar(<9 x i64> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v17, s28 ; SI-NEXT: v_mov_b32_e32 v18, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: v_readfirstlane_b32 s22, v5 -; SI-NEXT: v_readfirstlane_b32 s23, v6 -; SI-NEXT: v_readfirstlane_b32 s20, v7 -; SI-NEXT: v_readfirstlane_b32 s21, v8 -; SI-NEXT: v_readfirstlane_b32 s18, v9 -; SI-NEXT: v_readfirstlane_b32 s19, v10 -; SI-NEXT: v_readfirstlane_b32 s16, v11 -; SI-NEXT: v_readfirstlane_b32 s17, v12 -; SI-NEXT: v_readfirstlane_b32 s14, v13 -; SI-NEXT: v_readfirstlane_b32 s15, v14 -; SI-NEXT: v_readfirstlane_b32 s12, v15 -; SI-NEXT: v_readfirstlane_b32 s13, v16 -; SI-NEXT: v_readfirstlane_b32 s10, v17 -; SI-NEXT: v_readfirstlane_b32 s11, v18 -; SI-NEXT: v_readfirstlane_b32 s7, v0 -; SI-NEXT: v_readfirstlane_b32 s8, v1 -; SI-NEXT: v_readfirstlane_b32 s6, v2 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v3 +; SI-NEXT: v_readfirstlane_b32 s20, v5 +; SI-NEXT: v_readfirstlane_b32 s21, v6 +; SI-NEXT: v_readfirstlane_b32 s18, v7 +; SI-NEXT: v_readfirstlane_b32 s19, v8 +; SI-NEXT: v_readfirstlane_b32 s16, v9 +; SI-NEXT: v_readfirstlane_b32 s17, v10 +; SI-NEXT: v_readfirstlane_b32 s14, v11 +; SI-NEXT: v_readfirstlane_b32 s15, v12 +; SI-NEXT: v_readfirstlane_b32 s12, v13 +; SI-NEXT: v_readfirstlane_b32 s13, v14 +; SI-NEXT: v_readfirstlane_b32 s10, v15 +; SI-NEXT: v_readfirstlane_b32 s11, v16 +; SI-NEXT: v_readfirstlane_b32 s8, v17 +; SI-NEXT: v_readfirstlane_b32 s9, v18 +; SI-NEXT: v_readfirstlane_b32 s6, v0 +; SI-NEXT: v_readfirstlane_b32 s7, v1 +; SI-NEXT: v_readfirstlane_b32 s4, v2 +; SI-NEXT: s_and_b64 s[22:23], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v3 ; SI-NEXT: s_cbranch_scc0 .LBB45_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s22 +; SI-NEXT: s_lshr_b32 s60, s5, 16 +; SI-NEXT: s_lshr_b32 s61, s7, 16 +; SI-NEXT: s_lshr_b32 s62, s9, 16 +; SI-NEXT: s_lshr_b32 s63, s11, 16 +; SI-NEXT: s_lshr_b32 s72, s13, 16 +; SI-NEXT: s_lshr_b32 s73, s15, 16 +; SI-NEXT: s_lshr_b32 s74, s17, 16 +; SI-NEXT: s_lshr_b32 s75, s19, 16 +; SI-NEXT: s_lshr_b32 s76, s21, 16 +; SI-NEXT: s_lshr_b64 s[22:23], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[20:21], 16 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s22, 3 -; SI-NEXT: s_addc_u32 s5, s23, 0 -; SI-NEXT: s_lshr_b32 s22, s4, 16 -; SI-NEXT: s_lshr_b32 s23, s5, 16 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_lshr_b32 s24, s20, 16 -; SI-NEXT: s_lshr_b32 s25, s21, 16 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_lshr_b32 s26, s18, 16 -; SI-NEXT: s_lshr_b32 s27, s19, 16 -; SI-NEXT: s_add_u32 s16, s16, 3 -; SI-NEXT: s_addc_u32 s17, s17, 0 -; SI-NEXT: s_lshr_b32 s28, s16, 16 -; SI-NEXT: s_lshr_b32 s29, s17, 16 -; SI-NEXT: s_add_u32 s14, s14, 3 -; SI-NEXT: s_addc_u32 s15, s15, 0 -; SI-NEXT: s_lshr_b32 s40, s14, 16 -; SI-NEXT: s_lshr_b32 s41, s15, 16 -; SI-NEXT: s_add_u32 s12, s12, 3 -; SI-NEXT: s_addc_u32 s13, s13, 0 -; SI-NEXT: s_lshr_b32 s42, s12, 16 -; SI-NEXT: s_lshr_b32 s43, s13, 16 -; SI-NEXT: s_add_u32 s10, s10, 3 -; SI-NEXT: s_addc_u32 s11, s11, 0 -; SI-NEXT: s_lshr_b32 s44, s10, 16 -; SI-NEXT: s_lshr_b32 s45, s11, 16 -; SI-NEXT: s_add_u32 s7, s7, 3 -; SI-NEXT: s_addc_u32 s8, s8, 0 -; SI-NEXT: s_lshr_b32 s46, s7, 16 -; SI-NEXT: s_lshr_b32 s47, s8, 16 +; SI-NEXT: s_add_u32 s4, s4, 3 +; SI-NEXT: s_addc_u32 s5, s5, 0 ; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s7, s7, 0 +; SI-NEXT: s_add_u32 s8, s8, 3 ; SI-NEXT: s_addc_u32 s9, s9, 0 -; SI-NEXT: s_lshr_b32 s56, s6, 16 -; SI-NEXT: s_lshr_b32 s57, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s22 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_lshr_b32 s60, s5, 16 +; SI-NEXT: s_lshr_b32 s61, s7, 16 +; SI-NEXT: s_lshr_b32 s62, s9, 16 +; SI-NEXT: s_lshr_b32 s63, s11, 16 +; SI-NEXT: s_lshr_b32 s72, s13, 16 +; SI-NEXT: s_lshr_b32 s73, s15, 16 +; SI-NEXT: s_lshr_b32 s74, s17, 16 +; SI-NEXT: s_lshr_b32 s75, s19, 16 +; SI-NEXT: s_lshr_b32 s76, s21, 16 +; SI-NEXT: s_lshr_b64 s[22:23], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[20:21], 16 ; SI-NEXT: .LBB45_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_or_b32_e32 v2, v32, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v32 -; SI-NEXT: v_or_b32_e32 v5, v5, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v30 -; SI-NEXT: v_or_b32_e32 v7, v7, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v28 -; SI-NEXT: v_or_b32_e32 v9, v26, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 -; SI-NEXT: v_or_b32_e32 v11, v24, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v24 -; SI-NEXT: v_or_b32_e32 v13, v22, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v15, v20, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v20 -; SI-NEXT: v_or_b32_e32 v1, v36, v1 -; SI-NEXT: v_or_b32_e32 v3, v34, v3 -; SI-NEXT: v_or_b32_e32 v4, v31, v4 -; SI-NEXT: v_or_b32_e32 v6, v29, v6 -; SI-NEXT: v_or_b32_e32 v8, v27, v8 -; SI-NEXT: v_or_b32_e32 v10, v25, v10 -; SI-NEXT: v_or_b32_e32 v12, v23, v12 -; SI-NEXT: v_or_b32_e32 v14, v21, v14 -; SI-NEXT: v_or_b32_e32 v16, v19, v16 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: s_lshl_b32 s23, s56, 16 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_or_b32 s20, s20, s23 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s23, s76, 16 +; SI-NEXT: s_or_b32 s21, s21, s23 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s23, s46, 16 +; SI-NEXT: s_or_b32 s18, s18, s23 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s23, s75, 16 +; SI-NEXT: s_or_b32 s19, s19, s23 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s23, s44, 16 +; SI-NEXT: s_or_b32 s16, s16, s23 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s23, s74, 16 +; SI-NEXT: s_or_b32 s17, s17, s23 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s23, s42, 16 +; SI-NEXT: s_or_b32 s14, s14, s23 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s23, s73, 16 +; SI-NEXT: s_or_b32 s15, s15, s23 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s23, s40, 16 +; SI-NEXT: s_or_b32 s12, s12, s23 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s23, s72, 16 +; SI-NEXT: s_or_b32 s13, s13, s23 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s23, s28, 16 +; SI-NEXT: s_or_b32 s10, s10, s23 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s23, s63, 16 +; SI-NEXT: s_or_b32 s11, s11, s23 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s23, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s23 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s23, s62, 16 +; SI-NEXT: s_or_b32 s9, s9, s23 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s23, s24, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_or_b32 s6, s6, s23 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s23, s61, 16 +; SI-NEXT: s_or_b32 s4, s4, s22 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s22, s60, 16 +; SI-NEXT: s_or_b32 s7, s7, s23 +; SI-NEXT: s_or_b32 s5, s5, s22 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_mov_b32_e32 v1, s21 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: v_mov_b32_e32 v5, s17 +; SI-NEXT: v_mov_b32_e32 v6, s14 +; SI-NEXT: v_mov_b32_e32 v7, s15 +; SI-NEXT: v_mov_b32_e32 v8, s12 +; SI-NEXT: v_mov_b32_e32 v9, s13 +; SI-NEXT: v_mov_b32_e32 v10, s10 +; SI-NEXT: v_mov_b32_e32 v11, s11 +; SI-NEXT: v_mov_b32_e32 v12, s8 +; SI-NEXT: v_mov_b32_e32 v13, s9 +; SI-NEXT: v_mov_b32_e32 v14, s6 +; SI-NEXT: v_mov_b32_e32 v15, s7 +; SI-NEXT: v_mov_b32_e32 v16, s4 +; SI-NEXT: v_mov_b32_e32 v17, s5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: s_branch .LBB45_2 ; ; VI-LABEL: bitcast_v9i64_to_v36f16_scalar: @@ -19647,68 +18591,7 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v36f16_to_v9i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_mov_b32_e32 v32, v17 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -19725,101 +18608,102 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 +; SI-NEXT: v_mov_b32_e32 v33, v16 +; SI-NEXT: v_mov_b32_e32 v41, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_mov_b32_e32 v34, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_mov_b32_e32 v35, v14 +; SI-NEXT: v_mov_b32_e32 v36, v13 +; SI-NEXT: v_mov_b32_e32 v37, v12 +; SI-NEXT: v_mov_b32_e32 v38, v11 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_mov_b32_e32 v48, v9 +; SI-NEXT: v_mov_b32_e32 v49, v8 +; SI-NEXT: v_mov_b32_e32 v50, v7 +; SI-NEXT: v_mov_b32_e32 v51, v6 +; SI-NEXT: v_mov_b32_e32 v52, v5 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v3 +; SI-NEXT: v_mov_b32_e32 v55, v2 +; SI-NEXT: v_mov_b32_e32 v40, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v41 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v37 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 -; SI-NEXT: v_or_b32_e32 v1, v32, v1 -; SI-NEXT: v_or_b32_e32 v2, v62, v2 -; SI-NEXT: v_or_b32_e32 v3, v60, v3 -; SI-NEXT: v_or_b32_e32 v4, v58, v4 -; SI-NEXT: v_or_b32_e32 v5, v56, v5 -; SI-NEXT: v_or_b32_e32 v6, v46, v6 -; SI-NEXT: v_or_b32_e32 v7, v44, v7 -; SI-NEXT: v_or_b32_e32 v8, v42, v8 -; SI-NEXT: v_or_b32_e32 v9, v40, v9 -; SI-NEXT: v_or_b32_e32 v10, v54, v10 -; SI-NEXT: v_or_b32_e32 v11, v52, v11 -; SI-NEXT: v_or_b32_e32 v12, v50, v12 -; SI-NEXT: v_or_b32_e32 v13, v48, v13 -; SI-NEXT: v_or_b32_e32 v14, v38, v14 -; SI-NEXT: v_or_b32_e32 v15, v36, v15 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v43 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v63 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v62 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v60 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v59 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v58 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v56 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v47 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v46 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v35 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 @@ -19834,20 +18718,44 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v34 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v33 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; kill: killed $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; kill: killed $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; kill: killed $vgpr18 @@ -19856,10 +18764,10 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB46_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -19868,14 +18776,15 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v53 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -19884,32 +18793,32 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v62 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v52 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v51 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v59 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v60 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 @@ -19918,10 +18827,10 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v48 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -19929,29 +18838,30 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v56 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v39 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 @@ -19959,10 +18869,10 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v46 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v36 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 @@ -19970,40 +18880,38 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v35 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v36 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v32 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v33 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 @@ -20498,160 +19406,106 @@ define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> inreg %a, i32 ; SI-LABEL: bitcast_v36f16_to_v9i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s12 -; SI-NEXT: s_lshr_b32 s12, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s12 -; SI-NEXT: s_lshr_b32 s12, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s12 -; SI-NEXT: s_lshr_b32 s12, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s12 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v45, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v11 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 -; SI-NEXT: s_lshr_b32 s12, s17, 16 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s12 -; SI-NEXT: s_lshr_b32 s12, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v3 +; SI-NEXT: v_mov_b32_e32 v32, v3 +; SI-NEXT: v_mov_b32_e32 v33, v2 +; SI-NEXT: v_mov_b32_e32 v34, v1 +; SI-NEXT: v_mov_b32_e32 v35, v0 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v34 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v35 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 -; SI-NEXT: v_or_b32_e32 v1, v33, v1 -; SI-NEXT: v_or_b32_e32 v2, v44, v2 -; SI-NEXT: v_or_b32_e32 v3, v42, v3 -; SI-NEXT: v_or_b32_e32 v4, v62, v4 -; SI-NEXT: v_or_b32_e32 v5, v59, v5 -; SI-NEXT: v_or_b32_e32 v6, v58, v6 -; SI-NEXT: v_or_b32_e32 v7, v47, v7 -; SI-NEXT: v_or_b32_e32 v8, v46, v8 -; SI-NEXT: v_or_b32_e32 v9, v37, v9 -; SI-NEXT: v_or_b32_e32 v10, v36, v10 -; SI-NEXT: v_or_b32_e32 v11, v29, v11 -; SI-NEXT: v_or_b32_e32 v12, v28, v12 -; SI-NEXT: v_or_b32_e32 v13, v26, v13 -; SI-NEXT: v_or_b32_e32 v14, v24, v14 -; SI-NEXT: v_or_b32_e32 v15, v22, v15 -; SI-NEXT: v_or_b32_e32 v16, v20, v16 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -20664,10 +19518,10 @@ define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -20676,199 +19530,126 @@ define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v59 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v35 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v37 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v34 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v33 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v32 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v31 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v36 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v25 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v30 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v27 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v26 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v24 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v22 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v39 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v38 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v37 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 ; SI-NEXT: .LBB47_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v35, v42 -; SI-NEXT: v_mov_b32_e32 v42, v36 -; SI-NEXT: v_mov_b32_e32 v36, v19 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v34, v43 -; SI-NEXT: v_mov_b32_e32 v43, v37 -; SI-NEXT: v_mov_b32_e32 v37, v18 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v33, v44 -; SI-NEXT: v_mov_b32_e32 v44, v38 -; SI-NEXT: v_mov_b32_e32 v38, v20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v32, v45 -; SI-NEXT: v_mov_b32_e32 v45, v39 -; SI-NEXT: v_mov_b32_e32 v39, v21 -; SI-NEXT: v_mov_b32_e32 v48, v22 -; SI-NEXT: v_mov_b32_e32 v49, v23 -; SI-NEXT: v_mov_b32_e32 v50, v24 -; SI-NEXT: v_mov_b32_e32 v51, v25 -; SI-NEXT: v_mov_b32_e32 v52, v26 -; SI-NEXT: v_mov_b32_e32 v53, v27 -; SI-NEXT: v_mov_b32_e32 v54, v28 -; SI-NEXT: v_mov_b32_e32 v55, v29 -; SI-NEXT: v_mov_b32_e32 v40, v30 -; SI-NEXT: v_mov_b32_e32 v41, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v21, v39 -; SI-NEXT: v_mov_b32_e32 v39, v45 -; SI-NEXT: v_mov_b32_e32 v45, v32 -; SI-NEXT: v_mov_b32_e32 v20, v38 -; SI-NEXT: v_mov_b32_e32 v38, v44 -; SI-NEXT: v_mov_b32_e32 v44, v33 -; SI-NEXT: v_mov_b32_e32 v18, v37 -; SI-NEXT: v_mov_b32_e32 v37, v43 -; SI-NEXT: v_mov_b32_e32 v43, v34 -; SI-NEXT: v_mov_b32_e32 v19, v36 -; SI-NEXT: v_mov_b32_e32 v36, v42 -; SI-NEXT: v_mov_b32_e32 v42, v35 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v31, v41 -; SI-NEXT: v_mov_b32_e32 v30, v40 -; SI-NEXT: v_mov_b32_e32 v29, v55 -; SI-NEXT: v_mov_b32_e32 v28, v54 -; SI-NEXT: v_mov_b32_e32 v27, v53 -; SI-NEXT: v_mov_b32_e32 v26, v52 -; SI-NEXT: v_mov_b32_e32 v25, v51 -; SI-NEXT: v_mov_b32_e32 v24, v50 -; SI-NEXT: v_mov_b32_e32 v23, v49 -; SI-NEXT: v_mov_b32_e32 v22, v48 ; SI-NEXT: s_branch .LBB47_2 ; ; VI-LABEL: bitcast_v36f16_to_v9i64_scalar: @@ -23774,118 +22555,46 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v41, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v43, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v45, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: v_alignbit_b32 v18, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v19, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v20, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v21, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v22, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v23, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v26, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v29, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v31, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 ; SI-NEXT: .LBB52_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_4 @@ -23897,145 +22606,82 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_alignbit_b32 v18, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v19, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v20, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v21, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v22, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v23, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v26, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v29, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v31, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 ; SI-NEXT: .LBB52_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v42 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v40 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v4, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v54 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v52 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v51 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v48 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v39 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v35 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v36 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v31 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v32 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v27 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v28 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v22 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v24 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v21 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v35 +; SI-NEXT: v_or_b32_e32 v2, v2, v29 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v34 +; SI-NEXT: v_or_b32_e32 v4, v4, v26 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v33 +; SI-NEXT: v_or_b32_e32 v6, v6, v23 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v32 +; SI-NEXT: v_or_b32_e32 v8, v8, v22 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 +; SI-NEXT: v_or_b32_e32 v10, v10, v21 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v28 +; SI-NEXT: v_or_b32_e32 v12, v12, v20 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 +; SI-NEXT: v_or_b32_e32 v14, v14, v19 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 +; SI-NEXT: v_or_b32_e32 v16, v16, v18 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v31 +; SI-NEXT: v_or_b32_e32 v3, v3, v29 +; SI-NEXT: v_or_b32_e32 v5, v5, v26 +; SI-NEXT: v_or_b32_e32 v7, v7, v23 +; SI-NEXT: v_or_b32_e32 v9, v9, v22 +; SI-NEXT: v_or_b32_e32 v11, v11, v21 +; SI-NEXT: v_or_b32_e32 v13, v13, v20 +; SI-NEXT: v_or_b32_e32 v15, v15, v19 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9f64_to_v36f16: @@ -24394,274 +23040,149 @@ define inreg <36 x half> @bitcast_v9f64_to_v36f16_scalar(<9 x double> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: v_mov_b32_e32 v17, s16 -; SI-NEXT: v_mov_b32_e32 v18, s17 -; SI-NEXT: v_mov_b32_e32 v15, s18 -; SI-NEXT: v_mov_b32_e32 v16, s19 -; SI-NEXT: v_mov_b32_e32 v13, s20 -; SI-NEXT: v_mov_b32_e32 v14, s21 -; SI-NEXT: v_mov_b32_e32 v11, s22 -; SI-NEXT: v_mov_b32_e32 v12, s23 -; SI-NEXT: v_mov_b32_e32 v9, s24 -; SI-NEXT: v_mov_b32_e32 v10, s25 -; SI-NEXT: v_mov_b32_e32 v7, s26 -; SI-NEXT: v_mov_b32_e32 v8, s27 -; SI-NEXT: v_mov_b32_e32 v5, s28 +; SI-NEXT: v_mov_b32_e32 v18, s16 +; SI-NEXT: v_mov_b32_e32 v19, s17 +; SI-NEXT: v_mov_b32_e32 v16, s18 +; SI-NEXT: v_mov_b32_e32 v17, s19 +; SI-NEXT: v_mov_b32_e32 v14, s20 +; SI-NEXT: v_mov_b32_e32 v15, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v6, s29 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v11 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v41, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v43, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v45, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v17 +; SI-NEXT: v_lshr_b64 v[26:27], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[6:7], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; SI-NEXT: v_lshr_b64 v[24:25], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[4:5], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[20:21], v[18:19], 16 ; SI-NEXT: s_cbranch_execnz .LBB53_3 ; SI-NEXT: .LBB53_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: v_add_f64 v[4:5], v[5:6], 1.0 -; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v13 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: .LBB53_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v42 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v40 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v5, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v54 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v52 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v51 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v49 -; SI-NEXT: v_or_b32_e32 v5, v8, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v48 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v36 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v37 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v32 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v33 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v28 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v29 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v23 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v25 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v20 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v22 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_lshr_b64 v[26:27], v[12:13], 16 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_lshr_b64 v[22:23], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[4:5], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[20:21], v[18:19], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v20, v18, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v39 +; SI-NEXT: v_or_b32_e32 v21, v5, v18 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v18, v16, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v38 +; SI-NEXT: v_or_b32_e32 v19, v5, v16 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v37 +; SI-NEXT: v_or_b32_e32 v5, v5, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v36 +; SI-NEXT: v_or_b32_e32 v7, v7, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v27 +; SI-NEXT: v_or_b32_e32 v8, v8, v14 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v35 +; SI-NEXT: v_or_b32_e32 v9, v9, v14 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v24 +; SI-NEXT: v_or_b32_e32 v10, v10, v14 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v26 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v23 +; SI-NEXT: v_or_b32_e32 v14, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v20 +; SI-NEXT: v_mov_b32_e32 v1, v21 +; SI-NEXT: v_mov_b32_e32 v2, v18 +; SI-NEXT: v_mov_b32_e32 v3, v19 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: s_branch .LBB53_2 ; ; VI-LABEL: bitcast_v9f64_to_v36f16_scalar: @@ -25211,68 +23732,7 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v36f16_to_v9f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_mov_b32_e32 v32, v17 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -25289,101 +23749,102 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 +; SI-NEXT: v_mov_b32_e32 v33, v16 +; SI-NEXT: v_mov_b32_e32 v41, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_mov_b32_e32 v34, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_mov_b32_e32 v35, v14 +; SI-NEXT: v_mov_b32_e32 v36, v13 +; SI-NEXT: v_mov_b32_e32 v37, v12 +; SI-NEXT: v_mov_b32_e32 v38, v11 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_mov_b32_e32 v48, v9 +; SI-NEXT: v_mov_b32_e32 v49, v8 +; SI-NEXT: v_mov_b32_e32 v50, v7 +; SI-NEXT: v_mov_b32_e32 v51, v6 +; SI-NEXT: v_mov_b32_e32 v52, v5 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v3 +; SI-NEXT: v_mov_b32_e32 v55, v2 +; SI-NEXT: v_mov_b32_e32 v40, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v41 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v37 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 -; SI-NEXT: v_or_b32_e32 v1, v32, v1 -; SI-NEXT: v_or_b32_e32 v2, v62, v2 -; SI-NEXT: v_or_b32_e32 v3, v60, v3 -; SI-NEXT: v_or_b32_e32 v4, v58, v4 -; SI-NEXT: v_or_b32_e32 v5, v56, v5 -; SI-NEXT: v_or_b32_e32 v6, v46, v6 -; SI-NEXT: v_or_b32_e32 v7, v44, v7 -; SI-NEXT: v_or_b32_e32 v8, v42, v8 -; SI-NEXT: v_or_b32_e32 v9, v40, v9 -; SI-NEXT: v_or_b32_e32 v10, v54, v10 -; SI-NEXT: v_or_b32_e32 v11, v52, v11 -; SI-NEXT: v_or_b32_e32 v12, v50, v12 -; SI-NEXT: v_or_b32_e32 v13, v48, v13 -; SI-NEXT: v_or_b32_e32 v14, v38, v14 -; SI-NEXT: v_or_b32_e32 v15, v36, v15 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v43 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v63 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v62 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v60 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v59 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v58 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v56 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v47 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v46 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v35 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 @@ -25398,20 +23859,44 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v34 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v33 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; kill: killed $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; kill: killed $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; kill: killed $vgpr18 @@ -25420,10 +23905,10 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB54_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -25432,14 +23917,15 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v53 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -25448,32 +23934,32 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v62 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v52 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v51 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v59 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v60 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 @@ -25482,10 +23968,10 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v48 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -25493,29 +23979,30 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v56 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v39 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 @@ -25523,10 +24010,10 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v46 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v36 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 @@ -25534,40 +24021,38 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v35 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v36 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v32 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v33 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 @@ -26062,160 +24547,106 @@ define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 x half> inreg %a, ; SI-LABEL: bitcast_v36f16_to_v9f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s12 -; SI-NEXT: s_lshr_b32 s12, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s12 -; SI-NEXT: s_lshr_b32 s12, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s12 -; SI-NEXT: s_lshr_b32 s12, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s12 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v45, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v11 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 -; SI-NEXT: s_lshr_b32 s12, s17, 16 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s12 -; SI-NEXT: s_lshr_b32 s12, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v3 +; SI-NEXT: v_mov_b32_e32 v32, v3 +; SI-NEXT: v_mov_b32_e32 v33, v2 +; SI-NEXT: v_mov_b32_e32 v34, v1 +; SI-NEXT: v_mov_b32_e32 v35, v0 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v34 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v35 ; SI-NEXT: s_cbranch_scc0 .LBB55_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 -; SI-NEXT: v_or_b32_e32 v1, v33, v1 -; SI-NEXT: v_or_b32_e32 v2, v44, v2 -; SI-NEXT: v_or_b32_e32 v3, v42, v3 -; SI-NEXT: v_or_b32_e32 v4, v62, v4 -; SI-NEXT: v_or_b32_e32 v5, v59, v5 -; SI-NEXT: v_or_b32_e32 v6, v58, v6 -; SI-NEXT: v_or_b32_e32 v7, v47, v7 -; SI-NEXT: v_or_b32_e32 v8, v46, v8 -; SI-NEXT: v_or_b32_e32 v9, v37, v9 -; SI-NEXT: v_or_b32_e32 v10, v36, v10 -; SI-NEXT: v_or_b32_e32 v11, v29, v11 -; SI-NEXT: v_or_b32_e32 v12, v28, v12 -; SI-NEXT: v_or_b32_e32 v13, v26, v13 -; SI-NEXT: v_or_b32_e32 v14, v24, v14 -; SI-NEXT: v_or_b32_e32 v15, v22, v15 -; SI-NEXT: v_or_b32_e32 v16, v20, v16 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB55_3 ; SI-NEXT: .LBB55_2: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -26228,10 +24659,10 @@ define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -26240,199 +24671,126 @@ define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v59 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v35 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v37 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v34 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v33 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v32 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v31 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v36 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v25 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v30 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v27 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v26 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v24 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v22 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v39 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v38 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v37 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 ; SI-NEXT: .LBB55_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB55_4: -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v35, v42 -; SI-NEXT: v_mov_b32_e32 v42, v36 -; SI-NEXT: v_mov_b32_e32 v36, v19 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v34, v43 -; SI-NEXT: v_mov_b32_e32 v43, v37 -; SI-NEXT: v_mov_b32_e32 v37, v18 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v33, v44 -; SI-NEXT: v_mov_b32_e32 v44, v38 -; SI-NEXT: v_mov_b32_e32 v38, v20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v32, v45 -; SI-NEXT: v_mov_b32_e32 v45, v39 -; SI-NEXT: v_mov_b32_e32 v39, v21 -; SI-NEXT: v_mov_b32_e32 v48, v22 -; SI-NEXT: v_mov_b32_e32 v49, v23 -; SI-NEXT: v_mov_b32_e32 v50, v24 -; SI-NEXT: v_mov_b32_e32 v51, v25 -; SI-NEXT: v_mov_b32_e32 v52, v26 -; SI-NEXT: v_mov_b32_e32 v53, v27 -; SI-NEXT: v_mov_b32_e32 v54, v28 -; SI-NEXT: v_mov_b32_e32 v55, v29 -; SI-NEXT: v_mov_b32_e32 v40, v30 -; SI-NEXT: v_mov_b32_e32 v41, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v21, v39 -; SI-NEXT: v_mov_b32_e32 v39, v45 -; SI-NEXT: v_mov_b32_e32 v45, v32 -; SI-NEXT: v_mov_b32_e32 v20, v38 -; SI-NEXT: v_mov_b32_e32 v38, v44 -; SI-NEXT: v_mov_b32_e32 v44, v33 -; SI-NEXT: v_mov_b32_e32 v18, v37 -; SI-NEXT: v_mov_b32_e32 v37, v43 -; SI-NEXT: v_mov_b32_e32 v43, v34 -; SI-NEXT: v_mov_b32_e32 v19, v36 -; SI-NEXT: v_mov_b32_e32 v36, v42 -; SI-NEXT: v_mov_b32_e32 v42, v35 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v31, v41 -; SI-NEXT: v_mov_b32_e32 v30, v40 -; SI-NEXT: v_mov_b32_e32 v29, v55 -; SI-NEXT: v_mov_b32_e32 v28, v54 -; SI-NEXT: v_mov_b32_e32 v27, v53 -; SI-NEXT: v_mov_b32_e32 v26, v52 -; SI-NEXT: v_mov_b32_e32 v25, v51 -; SI-NEXT: v_mov_b32_e32 v24, v50 -; SI-NEXT: v_mov_b32_e32 v23, v49 -; SI-NEXT: v_mov_b32_e32 v22, v48 ; SI-NEXT: s_branch .LBB55_2 ; ; VI-LABEL: bitcast_v36f16_to_v9f64_scalar: @@ -26810,9 +25168,39 @@ end: define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v36i16_to_v36f16: ; SI: ; %bb.0: -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; kill: killed $vgpr27 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v0 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -26829,123 +25217,116 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; kill: killed $vgpr27 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; kill: killed $vgpr27 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; kill: killed $vgpr27 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v15 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v36 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v34 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v32 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v22 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v23 ; SI-NEXT: ; kill: killed $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v48, v1, v57 +; SI-NEXT: v_alignbit_b32 v1, v48, v46, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v59 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v39, v1, v58 +; SI-NEXT: v_alignbit_b32 v1, v39, v47, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v37, v1, v60 +; SI-NEXT: v_alignbit_b32 v1, v37, v56, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v63 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v34, v1, v62 +; SI-NEXT: v_alignbit_b32 v1, v34, v59, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v31, v1, v50 +; SI-NEXT: v_alignbit_b32 v1, v31, v61, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v28, v1, v52 +; SI-NEXT: v_alignbit_b32 v1, v28, v63, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v45, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v25, v1, v55 +; SI-NEXT: v_or_b32_e32 v42, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_alignbit_b32 v1, v25, v51, 16 +; SI-NEXT: v_or_b32_e32 v53, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v39, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v49, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v23, v1, v41 +; SI-NEXT: v_or_b32_e32 v38, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_alignbit_b32 v1, v23, v54, 16 +; SI-NEXT: v_or_b32_e32 v36, v0, v63 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v32, v0, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v18, v1, v43 +; SI-NEXT: v_or_b32_e32 v29, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: v_alignbit_b32 v44, v18, v40, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v40 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -26964,201 +25345,161 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: .LBB56_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v2, v47, v2 +; SI-NEXT: v_add_i32_e32 v45, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v4, v56, v4 +; SI-NEXT: v_add_i32_e32 v42, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v6, v59, v6 +; SI-NEXT: v_add_i32_e32 v53, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v8, v61, v8 +; SI-NEXT: v_add_i32_e32 v49, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v2, v58, v2 +; SI-NEXT: v_add_i32_e32 v48, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v10, v63, v10 +; SI-NEXT: v_add_i32_e32 v38, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v4, v60, v4 +; SI-NEXT: v_add_i32_e32 v39, vcc, s6, v2 +; SI-NEXT: v_alignbit_b32 v0, v48, v45, 16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v12, v51, v12 +; SI-NEXT: v_add_i32_e32 v36, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v6, v62, v6 +; SI-NEXT: v_add_i32_e32 v37, vcc, s6, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v0, v39, v42, 16 +; SI-NEXT: v_or_b32_e32 v16, v40, v16 +; SI-NEXT: v_or_b32_e32 v14, v54, v14 +; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v13 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v8, v50, v8 +; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v6 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v0, v37, v53, 16 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v10, v52, v10 +; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_alignbit_b32 v0, v34, v49, 16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v12, v55, v12 +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v10 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v0, v31, v38, 16 +; SI-NEXT: v_or_b32_e32 v16, v43, v16 +; SI-NEXT: v_or_b32_e32 v14, v41, v14 +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v63 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v62 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v61 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v60 -; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v59 -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v58 -; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v57 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v56 -; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v47 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v34 +; SI-NEXT: v_alignbit_b32 v0, v28, v36, 16 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v14 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 +; SI-NEXT: v_alignbit_b32 v0, v25, v32, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v0, v23, v29, 16 +; SI-NEXT: v_alignbit_b32 v44, v18, v26, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: .LBB56_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v48 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v53 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v4, v33 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v6, v37 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v49 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v52 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v28 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v55 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v31 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v41 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v35 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v43 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v42 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -27175,12 +25516,46 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v39 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -27548,249 +25923,351 @@ define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i ; SI-LABEL: bitcast_v36i16_to_v36f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s43, s29, 16 -; SI-NEXT: s_lshr_b32 s42, s28, 16 -; SI-NEXT: s_lshr_b32 s41, s27, 16 -; SI-NEXT: s_lshr_b32 s40, s26, 16 -; SI-NEXT: s_lshr_b32 s15, s25, 16 -; SI-NEXT: s_lshr_b32 s14, s24, 16 -; SI-NEXT: s_lshr_b32 s13, s23, 16 -; SI-NEXT: s_lshr_b32 s12, s22, 16 -; SI-NEXT: s_lshr_b32 s11, s21, 16 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v18, s30, 0 +; SI-NEXT: v_writelane_b32 v18, s31, 1 +; SI-NEXT: v_writelane_b32 v18, s34, 2 +; SI-NEXT: v_writelane_b32 v18, s35, 3 +; SI-NEXT: v_writelane_b32 v18, s36, 4 +; SI-NEXT: v_writelane_b32 v18, s37, 5 +; SI-NEXT: v_writelane_b32 v18, s38, 6 +; SI-NEXT: v_writelane_b32 v18, s39, 7 +; SI-NEXT: v_writelane_b32 v18, s48, 8 +; SI-NEXT: v_writelane_b32 v18, s49, 9 +; SI-NEXT: v_writelane_b32 v18, s50, 10 +; SI-NEXT: v_writelane_b32 v18, s51, 11 +; SI-NEXT: v_writelane_b32 v18, s52, 12 +; SI-NEXT: v_writelane_b32 v18, s53, 13 +; SI-NEXT: v_writelane_b32 v18, s54, 14 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: s_lshr_b32 s30, s29, 16 +; SI-NEXT: s_lshr_b32 s49, s28, 16 +; SI-NEXT: s_lshr_b32 s95, s27, 16 +; SI-NEXT: s_lshr_b32 s48, s26, 16 +; SI-NEXT: s_lshr_b32 s94, s25, 16 +; SI-NEXT: s_lshr_b32 s39, s24, 16 +; SI-NEXT: s_lshr_b32 s93, s23, 16 +; SI-NEXT: s_lshr_b32 s38, s22, 16 +; SI-NEXT: s_lshr_b32 s92, s21, 16 +; SI-NEXT: s_lshr_b32 s37, s20, 16 +; SI-NEXT: s_lshr_b32 s91, s19, 16 +; SI-NEXT: s_lshr_b32 s36, s18, 16 +; SI-NEXT: s_lshr_b32 s90, s17, 16 +; SI-NEXT: s_lshr_b32 s35, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; SI-NEXT: v_writelane_b32 v18, s55, 15 +; SI-NEXT: v_readfirstlane_b32 s52, v3 +; SI-NEXT: v_readfirstlane_b32 s54, v2 +; SI-NEXT: v_readfirstlane_b32 s50, v1 +; SI-NEXT: v_readfirstlane_b32 s51, v0 +; SI-NEXT: v_readfirstlane_b32 s34, v5 +; SI-NEXT: v_readfirstlane_b32 s55, v6 +; SI-NEXT: v_readfirstlane_b32 s31, v7 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; SI-NEXT: v_readfirstlane_b32 s53, v8 ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v4, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v50 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s90, 16 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s46, s35, 16 +; SI-NEXT: s_or_b32 s47, s5, s7 +; SI-NEXT: s_and_b32 s5, s19, 0xffff +; SI-NEXT: s_lshl_b32 s7, s91, 16 +; SI-NEXT: s_or_b32 s40, s4, s46 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s56, s36, 16 +; SI-NEXT: s_or_b32 s57, s5, s7 +; SI-NEXT: s_and_b32 s5, s21, 0xffff +; SI-NEXT: s_lshl_b32 s7, s92, 16 +; SI-NEXT: s_or_b32 s14, s4, s56 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s58, s37, 16 +; SI-NEXT: s_or_b32 s59, s5, s7 +; SI-NEXT: s_and_b32 s5, s23, 0xffff +; SI-NEXT: s_lshl_b32 s7, s93, 16 +; SI-NEXT: s_or_b32 s12, s4, s58 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s60, s38, 16 +; SI-NEXT: s_or_b32 s61, s5, s7 +; SI-NEXT: s_and_b32 s5, s25, 0xffff +; SI-NEXT: s_lshl_b32 s7, s94, 16 +; SI-NEXT: s_or_b32 s10, s4, s60 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s62, s39, 16 +; SI-NEXT: s_or_b32 s63, s5, s7 +; SI-NEXT: s_and_b32 s5, s27, 0xffff +; SI-NEXT: s_lshl_b32 s7, s95, 16 +; SI-NEXT: s_or_b32 s8, s4, s62 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s72, s48, 16 +; SI-NEXT: s_or_b32 s73, s5, s7 +; SI-NEXT: s_and_b32 s5, s29, 0xffff +; SI-NEXT: s_lshl_b32 s7, s30, 16 +; SI-NEXT: s_or_b32 s6, s4, s72 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s44, s49, 16 +; SI-NEXT: s_or_b32 s45, s5, s7 +; SI-NEXT: s_and_b32 s5, s50, 0xffff +; SI-NEXT: s_lshl_b32 s7, s31, 16 +; SI-NEXT: s_or_b32 s4, s4, s44 +; SI-NEXT: s_lshl_b32 s42, s53, 16 +; SI-NEXT: s_or_b32 s43, s5, s7 +; SI-NEXT: s_and_b32 s5, s52, 0xffff +; SI-NEXT: s_lshl_b32 s7, s34, 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[44:45], 16 +; SI-NEXT: s_and_b32 s44, s51, 0xffff +; SI-NEXT: s_or_b32 s79, s5, s7 +; SI-NEXT: s_lshl_b32 s78, s55, 16 +; SI-NEXT: s_or_b32 s44, s44, s42 +; SI-NEXT: s_lshr_b64 s[76:77], s[42:43], 16 +; SI-NEXT: s_and_b32 s42, s54, 0xffff +; SI-NEXT: s_mov_b32 s41, s47 +; SI-NEXT: s_lshr_b64 s[46:47], s[46:47], 16 +; SI-NEXT: s_mov_b32 s15, s57 +; SI-NEXT: s_lshr_b64 s[56:57], s[56:57], 16 +; SI-NEXT: s_mov_b32 s13, s59 +; SI-NEXT: s_lshr_b64 s[58:59], s[58:59], 16 +; SI-NEXT: s_mov_b32 s11, s61 +; SI-NEXT: s_lshr_b64 s[60:61], s[60:61], 16 +; SI-NEXT: s_mov_b32 s9, s63 +; SI-NEXT: s_lshr_b64 s[62:63], s[62:63], 16 +; SI-NEXT: s_mov_b32 s7, s73 +; SI-NEXT: s_lshr_b64 s[72:73], s[72:73], 16 +; SI-NEXT: s_mov_b32 s5, s45 +; SI-NEXT: s_mov_b32 s45, s43 +; SI-NEXT: s_or_b32 s42, s42, s78 +; SI-NEXT: s_mov_b32 s43, s79 +; SI-NEXT: s_lshr_b64 s[78:79], s[78:79], 16 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: s_add_i32 s43, s43, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s54, s54, 3 +; SI-NEXT: s_and_b32 s4, s54, 0xffff +; SI-NEXT: s_lshl_b32 s5, s55, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s52, s52, 3 +; SI-NEXT: s_add_i32 s42, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s52, 0xffff +; SI-NEXT: s_lshl_b32 s5, s34, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s51, s51, 3 +; SI-NEXT: s_add_i32 s43, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s51, 0xffff +; SI-NEXT: s_lshl_b32 s5, s53, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s50, s50, 3 +; SI-NEXT: s_add_i32 s44, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s50, 0xffff +; SI-NEXT: s_lshl_b32 s5, s31, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s45, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s49, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s29, 0xffff +; SI-NEXT: s_lshl_b32 s6, s30, 16 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s26, 0xffff +; SI-NEXT: s_lshl_b32 s7, s48, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s27, 0xffff +; SI-NEXT: s_lshl_b32 s8, s95, 16 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s39, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s25, 0xffff +; SI-NEXT: s_lshl_b32 s10, s94, 16 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s10, s10, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s38, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s93, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s12, s20, 0xffff +; SI-NEXT: s_lshl_b32 s13, s37, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s92, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_and_b32 s14, s18, 0xffff +; SI-NEXT: s_lshl_b32 s15, s36, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: s_and_b32 s15, s19, 0xffff +; SI-NEXT: s_lshl_b32 s18, s91, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: s_or_b32 s15, s18, s15 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s18, s35, 16 +; SI-NEXT: s_or_b32 s16, s18, s16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s40, s16, 0x30000 +; SI-NEXT: s_and_b32 s16, s17, 0xffff +; SI-NEXT: s_lshl_b32 s17, s90, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s41, s16, 0x30000 +; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[42:43], 16 +; SI-NEXT: s_lshr_b32 s90, s41, 16 +; SI-NEXT: s_lshr_b32 s91, s15, 16 +; SI-NEXT: s_lshr_b32 s92, s13, 16 +; SI-NEXT: s_lshr_b32 s93, s11, 16 +; SI-NEXT: s_lshr_b32 s94, s9, 16 +; SI-NEXT: s_lshr_b32 s95, s7, 16 +; SI-NEXT: s_lshr_b32 s30, s5, 16 +; SI-NEXT: s_lshr_b32 s31, s45, 16 +; SI-NEXT: s_lshr_b32 s34, s43, 16 ; SI-NEXT: .LBB57_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v19 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v23 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v13 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v27 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v16 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v31 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v20 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v33 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v24 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v35 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v28 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v37 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v29 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: s_and_b32 s16, s40, 0xffff +; SI-NEXT: s_lshl_b32 s17, s46, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s41, 0xffff +; SI-NEXT: s_lshl_b32 s18, s90, 16 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s18, s56, 16 +; SI-NEXT: s_or_b32 s14, s14, s18 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s18, s91, 16 +; SI-NEXT: s_or_b32 s15, s15, s18 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s18, s58, 16 +; SI-NEXT: s_or_b32 s12, s12, s18 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s18, s92, 16 +; SI-NEXT: s_or_b32 s13, s13, s18 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s18, s60, 16 +; SI-NEXT: s_or_b32 s10, s10, s18 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s18, s93, 16 +; SI-NEXT: s_or_b32 s11, s11, s18 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s18, s62, 16 +; SI-NEXT: s_or_b32 s8, s8, s18 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s18, s94, 16 +; SI-NEXT: s_or_b32 s9, s9, s18 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s18, s72, 16 +; SI-NEXT: s_or_b32 s6, s6, s18 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s18, s95, 16 +; SI-NEXT: s_or_b32 s7, s7, s18 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s18, s74, 16 +; SI-NEXT: s_or_b32 s4, s4, s18 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s18, s30, 16 +; SI-NEXT: s_or_b32 s5, s5, s18 +; SI-NEXT: s_and_b32 s18, s44, 0xffff +; SI-NEXT: s_lshl_b32 s19, s76, 16 +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: s_and_b32 s19, s45, 0xffff +; SI-NEXT: s_lshl_b32 s20, s31, 16 +; SI-NEXT: s_or_b32 s19, s19, s20 +; SI-NEXT: s_and_b32 s20, s42, 0xffff +; SI-NEXT: s_lshl_b32 s21, s78, 16 +; SI-NEXT: s_or_b32 s20, s20, s21 +; SI-NEXT: s_and_b32 s21, s43, 0xffff +; SI-NEXT: s_lshl_b32 s22, s34, 16 +; SI-NEXT: s_or_b32 s21, s21, s22 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: v_mov_b32_e32 v3, s15 +; SI-NEXT: v_mov_b32_e32 v4, s12 +; SI-NEXT: v_mov_b32_e32 v5, s13 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v8, s8 +; SI-NEXT: v_mov_b32_e32 v9, s9 +; SI-NEXT: v_mov_b32_e32 v10, s6 +; SI-NEXT: v_mov_b32_e32 v11, s7 +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: v_mov_b32_e32 v13, s5 +; SI-NEXT: v_mov_b32_e32 v14, s18 +; SI-NEXT: v_mov_b32_e32 v15, s19 +; SI-NEXT: v_mov_b32_e32 v16, s20 +; SI-NEXT: v_mov_b32_e32 v17, s21 +; SI-NEXT: v_readlane_b32 s55, v18, 15 +; SI-NEXT: v_readlane_b32 s54, v18, 14 +; SI-NEXT: v_readlane_b32 s53, v18, 13 +; SI-NEXT: v_readlane_b32 s52, v18, 12 +; SI-NEXT: v_readlane_b32 s51, v18, 11 +; SI-NEXT: v_readlane_b32 s50, v18, 10 +; SI-NEXT: v_readlane_b32 s49, v18, 9 +; SI-NEXT: v_readlane_b32 s48, v18, 8 +; SI-NEXT: v_readlane_b32 s39, v18, 7 +; SI-NEXT: v_readlane_b32 s38, v18, 6 +; SI-NEXT: v_readlane_b32 s37, v18, 5 +; SI-NEXT: v_readlane_b32 s36, v18, 4 +; SI-NEXT: v_readlane_b32 s35, v18, 3 +; SI-NEXT: v_readlane_b32 s34, v18, 2 +; SI-NEXT: v_readlane_b32 s31, v18, 1 +; SI-NEXT: v_readlane_b32 s30, v18, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr78 ; SI-NEXT: s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v36i16_to_v36f16_scalar: @@ -28385,310 +26862,238 @@ define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v36f16_to_v36i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v20 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v5 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v36 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v17, v17, v36 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_or_b32_e32 v15, v15, v36 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_or_b32_e32 v13, v13, v36 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_or_b32_e32 v11, v11, v36 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 ; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 ; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 ; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_or_b32_e32 v9, v9, v36 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_or_b32_e32 v7, v7, v36 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_or_b32_e32 v7, v7, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_or_b32_e32 v5, v5, v36 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v18 ; SI-NEXT: v_or_b32_e32 v3, v3, v36 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v30 ; SI-NEXT: v_or_b32_e32 v1, v1, v36 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v0, v0, v35 -; SI-NEXT: v_or_b32_e32 v27, v27, v34 -; SI-NEXT: v_or_b32_e32 v26, v26, v33 -; SI-NEXT: v_or_b32_e32 v24, v24, v32 -; SI-NEXT: v_or_b32_e32 v23, v23, v31 -; SI-NEXT: v_or_b32_e32 v22, v22, v30 -; SI-NEXT: v_or_b32_e32 v20, v20, v29 -; SI-NEXT: v_or_b32_e32 v21, v21, v28 -; SI-NEXT: v_or_b32_e32 v19, v19, v25 +; SI-NEXT: v_or_b32_e32 v2, v2, v34 +; SI-NEXT: v_or_b32_e32 v4, v4, v33 +; SI-NEXT: v_or_b32_e32 v6, v6, v32 +; SI-NEXT: v_or_b32_e32 v8, v8, v31 +; SI-NEXT: v_or_b32_e32 v10, v10, v29 +; SI-NEXT: v_or_b32_e32 v12, v12, v27 +; SI-NEXT: v_or_b32_e32 v14, v14, v24 +; SI-NEXT: v_or_b32_e32 v16, v16, v22 ; SI-NEXT: v_alignbit_b32 v35, v1, v35, 16 ; SI-NEXT: v_alignbit_b32 v34, v3, v34, 16 ; SI-NEXT: v_alignbit_b32 v33, v5, v33, 16 ; SI-NEXT: v_alignbit_b32 v32, v7, v32, 16 ; SI-NEXT: v_alignbit_b32 v31, v9, v31, 16 -; SI-NEXT: v_alignbit_b32 v30, v11, v30, 16 -; SI-NEXT: v_alignbit_b32 v29, v13, v29, 16 -; SI-NEXT: v_alignbit_b32 v28, v15, v28, 16 -; SI-NEXT: v_alignbit_b32 v25, v17, v25, 16 +; SI-NEXT: v_alignbit_b32 v29, v11, v29, 16 +; SI-NEXT: v_alignbit_b32 v27, v13, v27, 16 +; SI-NEXT: v_alignbit_b32 v24, v15, v24, 16 +; SI-NEXT: v_alignbit_b32 v22, v17, v22, 16 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v3, v3, v18 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v33 +; SI-NEXT: v_or_b32_e32 v4, v4, v18 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 +; SI-NEXT: v_or_b32_e32 v5, v5, v18 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 +; SI-NEXT: v_or_b32_e32 v6, v6, v18 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 +; SI-NEXT: v_or_b32_e32 v7, v7, v18 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 +; SI-NEXT: v_or_b32_e32 v8, v8, v18 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 +; SI-NEXT: v_or_b32_e32 v9, v9, v18 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 +; SI-NEXT: v_or_b32_e32 v10, v10, v18 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 +; SI-NEXT: v_or_b32_e32 v11, v11, v18 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 +; SI-NEXT: v_or_b32_e32 v12, v12, v18 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; SI-NEXT: v_or_b32_e32 v13, v13, v18 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 +; SI-NEXT: v_or_b32_e32 v14, v14, v18 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v15, v15, v18 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v34 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v33 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v32 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v31 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 -; SI-NEXT: v_or_b32_e32 v12, v12, v20 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v30 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v34 +; SI-NEXT: v_or_b32_e32 v16, v16, v18 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 ; SI-NEXT: v_or_b32_e32 v0, v0, v35 -; SI-NEXT: v_or_b32_e32 v2, v2, v27 -; SI-NEXT: v_or_b32_e32 v4, v4, v26 -; SI-NEXT: v_or_b32_e32 v6, v6, v24 -; SI-NEXT: v_or_b32_e32 v8, v8, v23 -; SI-NEXT: v_or_b32_e32 v10, v10, v22 -; SI-NEXT: v_or_b32_e32 v14, v14, v20 -; SI-NEXT: v_or_b32_e32 v16, v16, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v30 ; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -29057,328 +27462,289 @@ define inreg <36 x i16> @bitcast_v36f16_to_v36i16_scalar(<36 x half> inreg %a, i ; SI-LABEL: bitcast_v36f16_to_v36i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s24 +; SI-NEXT: v_mov_b32_e32 v18, v2 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: s_lshr_b32 s7, s29, 16 +; SI-NEXT: s_lshr_b32 s6, s28, 16 +; SI-NEXT: s_lshr_b32 s40, s27, 16 +; SI-NEXT: s_lshr_b32 s12, s26, 16 +; SI-NEXT: s_lshr_b32 s14, s25, 16 +; SI-NEXT: s_lshr_b32 s13, s24, 16 +; SI-NEXT: s_lshr_b32 s11, s23, 16 +; SI-NEXT: s_lshr_b32 s15, s22, 16 +; SI-NEXT: s_lshr_b32 s10, s21, 16 +; SI-NEXT: s_lshr_b32 s41, s20, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s42, s18, 16 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: s_lshr_b32 s6, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s22 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v42, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 -; SI-NEXT: s_lshr_b32 s7, s28, 16 -; SI-NEXT: s_lshr_b32 s8, s26, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: s_lshr_b32 s10, s22, 16 -; SI-NEXT: s_lshr_b32 s11, s20, 16 -; SI-NEXT: s_lshr_b32 s12, s18, 16 -; SI-NEXT: s_lshr_b32 s13, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v10 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v45, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v16 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB59_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: s_cbranch_execnz .LBB59_4 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v16, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s15 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v18 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s19 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s13 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v7 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s40 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s12 +; SI-NEXT: v_or_b32_e32 v11, v5, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s23 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v7 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s21 +; SI-NEXT: v_or_b32_e32 v13, v5, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v9, v9, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s27 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s17 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v43 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v49 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v44, v18, v0 -; SI-NEXT: v_or_b32_e32 v42, v19, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v55 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v16, v15, v2 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s7 +; SI-NEXT: v_or_b32_e32 v7, v7, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s6 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v52 +; SI-NEXT: v_or_b32_e32 v23, v0, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v21 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v37 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s18 +; SI-NEXT: v_or_b32_e32 v3, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s20 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_or_b32_e32 v40, v19, v6 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v52 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v24 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 -; SI-NEXT: v_or_b32_e32 v43, v19, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v45 -; SI-NEXT: v_or_b32_e32 v55, v18, v6 -; SI-NEXT: v_or_b32_e32 v52, v20, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v50 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v43, v19, v4 +; SI-NEXT: v_or_b32_e32 v42, v14, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v45, v19, v10 -; SI-NEXT: v_or_b32_e32 v40, v18, v12 -; SI-NEXT: v_or_b32_e32 v53, v20, v14 -; SI-NEXT: v_or_b32_e32 v50, v21, v16 -; SI-NEXT: v_lshr_b64 v[34:35], v[0:1], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[2:3], 16 -; SI-NEXT: v_lshr_b64 v[30:31], v[4:5], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[6:7], 16 -; SI-NEXT: v_lshr_b64 v[26:27], v[8:9], 16 -; SI-NEXT: v_lshr_b64 v[24:25], v[10:11], 16 -; SI-NEXT: v_lshr_b64 v[22:23], v[12:13], 16 -; SI-NEXT: v_lshr_b64 v[20:21], v[14:15], 16 -; SI-NEXT: v_lshr_b64 v[18:19], v[16:17], 16 -; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v55, v20, v12 +; SI-NEXT: v_or_b32_e32 v53, v21, v10 +; SI-NEXT: v_or_b32_e32 v41, v14, v15 +; SI-NEXT: v_or_b32_e32 v54, v19, v22 +; SI-NEXT: v_lshr_b64 v[34:35], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[19:20], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[2:3], 16 +; SI-NEXT: v_or_b32_e32 v17, v17, v0 +; SI-NEXT: v_or_b32_e32 v18, v18, v2 +; SI-NEXT: s_branch .LBB59_5 +; SI-NEXT: .LBB59_3: +; SI-NEXT: s_branch .LBB59_2 +; SI-NEXT: .LBB59_4: +; SI-NEXT: v_mov_b32_e32 v52, s7 +; SI-NEXT: v_mov_b32_e32 v38, s40 +; SI-NEXT: v_mov_b32_e32 v39, s14 +; SI-NEXT: v_mov_b32_e32 v48, s11 +; SI-NEXT: v_mov_b32_e32 v49, s10 +; SI-NEXT: v_mov_b32_e32 v50, s9 +; SI-NEXT: v_mov_b32_e32 v51, s8 +; SI-NEXT: v_mov_b32_e32 v7, s17 +; SI-NEXT: v_mov_b32_e32 v5, s19 +; SI-NEXT: v_mov_b32_e32 v9, s21 +; SI-NEXT: v_mov_b32_e32 v54, s28 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v41, s26 +; SI-NEXT: v_mov_b32_e32 v53, s24 +; SI-NEXT: v_mov_b32_e32 v55, s22 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v42, s20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v43, s18 +; SI-NEXT: v_mov_b32_e32 v40, s16 +; SI-NEXT: v_mov_b32_e32 v13, s23 +; SI-NEXT: v_mov_b32_e32 v11, s25 +; SI-NEXT: v_mov_b32_e32 v16, s27 +; SI-NEXT: v_mov_b32_e32 v23, s29 +; SI-NEXT: v_mov_b32_e32 v34, s43 +; SI-NEXT: v_mov_b32_e32 v19, s42 +; SI-NEXT: v_mov_b32_e32 v32, s41 +; SI-NEXT: v_mov_b32_e32 v30, s15 +; SI-NEXT: v_mov_b32_e32 v28, s13 +; SI-NEXT: v_mov_b32_e32 v26, s12 +; SI-NEXT: v_mov_b32_e32 v24, s6 +; SI-NEXT: .LBB59_5: ; %end ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v44 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v40 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v42 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v51 +; SI-NEXT: v_or_b32_e32 v20, v2, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v43 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v54 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v43 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v50 +; SI-NEXT: v_or_b32_e32 v19, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v32 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v49 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v28 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v30 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v13 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v48 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v26 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v52 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v28 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v39 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v45 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 -; SI-NEXT: v_or_b32_e32 v10, v10, v12 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v49 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v41 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v38 ; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v40 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v22 -; SI-NEXT: v_or_b32_e32 v12, v12, v14 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v53 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v20 -; SI-NEXT: v_or_b32_e32 v14, v14, v16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v37 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v50 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v16, v16, v18 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v36 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v24 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v52 +; SI-NEXT: v_or_b32_e32 v13, v13, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v37 +; SI-NEXT: v_or_b32_e32 v15, v1, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v21 +; SI-NEXT: v_or_b32_e32 v16, v1, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36 +; SI-NEXT: v_or_b32_e32 v17, v1, v3 +; SI-NEXT: v_mov_b32_e32 v1, v20 +; SI-NEXT: v_mov_b32_e32 v3, v19 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB59_4: -; SI-NEXT: s_branch .LBB59_2 ; ; VI-LABEL: bitcast_v36f16_to_v36i16_scalar: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll index ce06af35bf4f0..9896de3fe8c5e 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll @@ -5140,327 +5140,156 @@ define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v45, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v47, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v57, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_alignbit_b32 v20, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v21, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v22, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v23, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v24, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v25, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v26, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v29, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v31, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v34, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 ; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_alignbit_b32 v20, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v21, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v22, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v23, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v24, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v25, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v26, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v29, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v31, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v34, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 ; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v56 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v44 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v41 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v54 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v53 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v49 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v50 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v37 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v38 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v33 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v34 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v28 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v30 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v24 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v27 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v21 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v39 +; SI-NEXT: v_or_b32_e32 v2, v2, v31 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v38 +; SI-NEXT: v_or_b32_e32 v4, v4, v29 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v37 +; SI-NEXT: v_or_b32_e32 v6, v6, v26 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v36 +; SI-NEXT: v_or_b32_e32 v8, v8, v25 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v35 +; SI-NEXT: v_or_b32_e32 v10, v10, v24 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v33 +; SI-NEXT: v_or_b32_e32 v12, v12, v23 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v32 +; SI-NEXT: v_or_b32_e32 v14, v14, v22 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 +; SI-NEXT: v_or_b32_e32 v16, v16, v21 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v28 +; SI-NEXT: v_or_b32_e32 v18, v18, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: v_or_b32_e32 v3, v3, v31 +; SI-NEXT: v_or_b32_e32 v5, v5, v29 +; SI-NEXT: v_or_b32_e32 v7, v7, v26 +; SI-NEXT: v_or_b32_e32 v9, v9, v25 +; SI-NEXT: v_or_b32_e32 v11, v11, v24 +; SI-NEXT: v_or_b32_e32 v13, v13, v23 +; SI-NEXT: v_or_b32_e32 v15, v15, v22 +; SI-NEXT: v_or_b32_e32 v17, v17, v21 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20i32_to_v40f16: @@ -5901,95 +5730,53 @@ define inreg <40 x half> @bitcast_v20i32_to_v40f16_scalar(<20 x i32> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v17, s26 ; SI-NEXT: v_mov_b32_e32 v18, s27 ; SI-NEXT: v_mov_b32_e32 v19, s28 -; SI-NEXT: v_readfirstlane_b32 s24, v7 +; SI-NEXT: v_readfirstlane_b32 s22, v7 ; SI-NEXT: v_mov_b32_e32 v7, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: v_readfirstlane_b32 s25, v8 -; SI-NEXT: v_readfirstlane_b32 s23, v9 -; SI-NEXT: v_readfirstlane_b32 s22, v10 -; SI-NEXT: v_readfirstlane_b32 s21, v11 -; SI-NEXT: v_readfirstlane_b32 s20, v12 -; SI-NEXT: v_readfirstlane_b32 s19, v13 -; SI-NEXT: v_readfirstlane_b32 s18, v14 -; SI-NEXT: v_readfirstlane_b32 s17, v15 -; SI-NEXT: v_readfirstlane_b32 s16, v16 -; SI-NEXT: v_readfirstlane_b32 s15, v17 -; SI-NEXT: v_readfirstlane_b32 s14, v18 -; SI-NEXT: v_readfirstlane_b32 s13, v19 -; SI-NEXT: v_readfirstlane_b32 s12, v7 -; SI-NEXT: v_readfirstlane_b32 s11, v0 -; SI-NEXT: v_readfirstlane_b32 s10, v1 -; SI-NEXT: v_readfirstlane_b32 s8, v2 +; SI-NEXT: v_readfirstlane_b32 s23, v8 +; SI-NEXT: v_readfirstlane_b32 s20, v9 +; SI-NEXT: v_readfirstlane_b32 s21, v10 +; SI-NEXT: v_readfirstlane_b32 s18, v11 +; SI-NEXT: v_readfirstlane_b32 s19, v12 +; SI-NEXT: v_readfirstlane_b32 s16, v13 +; SI-NEXT: v_readfirstlane_b32 s17, v14 +; SI-NEXT: v_readfirstlane_b32 s14, v15 +; SI-NEXT: v_readfirstlane_b32 s15, v16 +; SI-NEXT: v_readfirstlane_b32 s12, v17 +; SI-NEXT: v_readfirstlane_b32 s13, v18 +; SI-NEXT: v_readfirstlane_b32 s10, v19 +; SI-NEXT: v_readfirstlane_b32 s11, v7 +; SI-NEXT: v_readfirstlane_b32 s8, v0 +; SI-NEXT: v_readfirstlane_b32 s9, v1 +; SI-NEXT: v_readfirstlane_b32 s6, v2 ; SI-NEXT: v_readfirstlane_b32 s7, v3 -; SI-NEXT: v_readfirstlane_b32 s6, v4 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v5 -; SI-NEXT: s_cbranch_scc0 .LBB17_4 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s24 +; SI-NEXT: v_readfirstlane_b32 s4, v4 +; SI-NEXT: s_and_b64 s[24:25], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v5 +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s72, s5, 16 +; SI-NEXT: s_lshr_b32 s73, s7, 16 +; SI-NEXT: s_lshr_b32 s74, s9, 16 +; SI-NEXT: s_lshr_b32 s75, s11, 16 +; SI-NEXT: s_lshr_b32 s76, s13, 16 +; SI-NEXT: s_lshr_b32 s77, s15, 16 +; SI-NEXT: s_lshr_b32 s78, s17, 16 +; SI-NEXT: s_lshr_b32 s79, s19, 16 +; SI-NEXT: s_lshr_b32 s88, s21, 16 +; SI-NEXT: s_lshr_b32 s89, s23, 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[22:23], 16 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 @@ -6004,193 +5791,135 @@ define inreg <40 x half> @bitcast_v20i32_to_v40f16_scalar(<20 x i32> inreg %a, i ; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: s_lshr_b32 s5, s25, 16 -; SI-NEXT: s_lshr_b32 s26, s23, 16 -; SI-NEXT: s_lshr_b32 s27, s22, 16 -; SI-NEXT: s_lshr_b32 s28, s21, 16 -; SI-NEXT: s_lshr_b32 s29, s20, 16 -; SI-NEXT: s_lshr_b32 s40, s19, 16 -; SI-NEXT: s_lshr_b32 s41, s18, 16 -; SI-NEXT: s_lshr_b32 s42, s17, 16 -; SI-NEXT: s_lshr_b32 s43, s16, 16 -; SI-NEXT: s_lshr_b32 s44, s15, 16 -; SI-NEXT: s_lshr_b32 s45, s14, 16 -; SI-NEXT: s_lshr_b32 s46, s13, 16 -; SI-NEXT: s_lshr_b32 s47, s12, 16 -; SI-NEXT: s_lshr_b32 s56, s11, 16 -; SI-NEXT: s_lshr_b32 s57, s10, 16 -; SI-NEXT: s_lshr_b32 s58, s8, 16 -; SI-NEXT: s_lshr_b32 s59, s7, 16 -; SI-NEXT: s_lshr_b32 s60, s6, 16 -; SI-NEXT: s_lshr_b32 s61, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 +; SI-NEXT: s_add_i32 s5, s5, 3 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s72, s5, 16 +; SI-NEXT: s_lshr_b32 s73, s7, 16 +; SI-NEXT: s_lshr_b32 s74, s9, 16 +; SI-NEXT: s_lshr_b32 s75, s11, 16 +; SI-NEXT: s_lshr_b32 s76, s13, 16 +; SI-NEXT: s_lshr_b32 s77, s15, 16 +; SI-NEXT: s_lshr_b32 s78, s17, 16 +; SI-NEXT: s_lshr_b32 s79, s19, 16 +; SI-NEXT: s_lshr_b32 s88, s21, 16 +; SI-NEXT: s_lshr_b32 s89, s23, 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[22:23], 16 ; SI-NEXT: .LBB17_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; SI-NEXT: v_or_b32_e32 v2, v36, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v36 -; SI-NEXT: v_or_b32_e32 v5, v5, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v34 -; SI-NEXT: v_or_b32_e32 v7, v7, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v32 -; SI-NEXT: v_or_b32_e32 v9, v30, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v30 -; SI-NEXT: v_or_b32_e32 v11, v28, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v28 -; SI-NEXT: v_or_b32_e32 v13, v26, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v26 -; SI-NEXT: v_or_b32_e32 v15, v24, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v17, v22, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v22 -; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_or_b32_e32 v3, v38, v3 -; SI-NEXT: v_or_b32_e32 v4, v35, v4 -; SI-NEXT: v_or_b32_e32 v6, v33, v6 -; SI-NEXT: v_or_b32_e32 v8, v31, v8 -; SI-NEXT: v_or_b32_e32 v10, v29, v10 -; SI-NEXT: v_or_b32_e32 v12, v27, v12 -; SI-NEXT: v_or_b32_e32 v14, v25, v14 -; SI-NEXT: v_or_b32_e32 v16, v23, v16 -; SI-NEXT: v_or_b32_e32 v18, v21, v18 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: s_lshl_b32 s25, s60, 16 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_or_b32 s22, s22, s25 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s25, s89, 16 +; SI-NEXT: s_or_b32 s23, s23, s25 +; SI-NEXT: s_lshl_b32 s25, s58, 16 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_or_b32 s20, s20, s25 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s25, s88, 16 +; SI-NEXT: s_or_b32 s21, s21, s25 +; SI-NEXT: s_lshl_b32 s25, s56, 16 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_or_b32 s18, s18, s25 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s25, s79, 16 +; SI-NEXT: s_or_b32 s19, s19, s25 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s25, s46, 16 +; SI-NEXT: s_or_b32 s16, s16, s25 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s25, s78, 16 +; SI-NEXT: s_or_b32 s17, s17, s25 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s25, s44, 16 +; SI-NEXT: s_or_b32 s14, s14, s25 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s25, s77, 16 +; SI-NEXT: s_or_b32 s15, s15, s25 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s25, s42, 16 +; SI-NEXT: s_or_b32 s12, s12, s25 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s25, s76, 16 +; SI-NEXT: s_or_b32 s13, s13, s25 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s25, s40, 16 +; SI-NEXT: s_or_b32 s10, s10, s25 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s25, s75, 16 +; SI-NEXT: s_or_b32 s11, s11, s25 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s25, s28, 16 +; SI-NEXT: s_or_b32 s8, s8, s25 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s25, s74, 16 +; SI-NEXT: s_or_b32 s9, s9, s25 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s25, s26, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s24, s24, 16 +; SI-NEXT: s_or_b32 s6, s6, s25 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s25, s73, 16 +; SI-NEXT: s_or_b32 s4, s4, s24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s24, s72, 16 +; SI-NEXT: s_or_b32 s7, s7, s25 +; SI-NEXT: s_or_b32 s5, s5, s24 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_mov_b32_e32 v1, s23 +; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: v_mov_b32_e32 v3, s21 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v5, s19 +; SI-NEXT: v_mov_b32_e32 v6, s16 +; SI-NEXT: v_mov_b32_e32 v7, s17 +; SI-NEXT: v_mov_b32_e32 v8, s14 +; SI-NEXT: v_mov_b32_e32 v9, s15 +; SI-NEXT: v_mov_b32_e32 v10, s12 +; SI-NEXT: v_mov_b32_e32 v11, s13 +; SI-NEXT: v_mov_b32_e32 v12, s10 +; SI-NEXT: v_mov_b32_e32 v13, s11 +; SI-NEXT: v_mov_b32_e32 v14, s8 +; SI-NEXT: v_mov_b32_e32 v15, s9 +; SI-NEXT: v_mov_b32_e32 v16, s6 +; SI-NEXT: v_mov_b32_e32 v17, s7 +; SI-NEXT: v_mov_b32_e32 v18, s4 +; SI-NEXT: v_mov_b32_e32 v19, s5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v20i32_to_v40f16_scalar: @@ -6745,86 +6474,7 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v40f16_to_v20i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_mov_b32_e32 v32, v19 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -6841,103 +6491,112 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 +; SI-NEXT: v_mov_b32_e32 v33, v18 +; SI-NEXT: v_mov_b32_e32 v43, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_mov_b32_e32 v34, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v15 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_mov_b32_e32 v35, v16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_mov_b32_e32 v36, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_mov_b32_e32 v37, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_mov_b32_e32 v38, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_mov_b32_e32 v39, v12 +; SI-NEXT: v_mov_b32_e32 v48, v11 +; SI-NEXT: v_mov_b32_e32 v49, v10 +; SI-NEXT: v_mov_b32_e32 v50, v9 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v7 +; SI-NEXT: v_mov_b32_e32 v53, v6 +; SI-NEXT: v_mov_b32_e32 v54, v5 +; SI-NEXT: v_mov_b32_e32 v55, v4 +; SI-NEXT: v_mov_b32_e32 v40, v3 +; SI-NEXT: v_mov_b32_e32 v41, v2 +; SI-NEXT: v_mov_b32_e32 v42, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v43 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v49 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 -; SI-NEXT: v_or_b32_e32 v1, v36, v1 -; SI-NEXT: v_or_b32_e32 v2, v34, v2 -; SI-NEXT: v_or_b32_e32 v3, v32, v3 -; SI-NEXT: v_or_b32_e32 v4, v62, v4 -; SI-NEXT: v_or_b32_e32 v5, v60, v5 -; SI-NEXT: v_or_b32_e32 v6, v58, v6 -; SI-NEXT: v_or_b32_e32 v7, v56, v7 -; SI-NEXT: v_or_b32_e32 v8, v46, v8 -; SI-NEXT: v_or_b32_e32 v9, v44, v9 -; SI-NEXT: v_or_b32_e32 v10, v42, v10 -; SI-NEXT: v_or_b32_e32 v11, v40, v11 -; SI-NEXT: v_or_b32_e32 v12, v54, v12 -; SI-NEXT: v_or_b32_e32 v13, v52, v13 -; SI-NEXT: v_or_b32_e32 v14, v50, v14 -; SI-NEXT: v_or_b32_e32 v15, v48, v15 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v56 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v46 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v45 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v44 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v63 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v62 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v60 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v39 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 @@ -6950,25 +6609,50 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v38 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v36 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v35 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v34 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v33 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr20 @@ -6982,6 +6666,13 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; kill: killed $vgpr20 @@ -6990,10 +6681,10 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB18_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -7006,14 +6697,10 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -7022,143 +6709,149 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v54 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v53 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v52 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v51 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v61 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v49 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v53 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v48 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v45 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v62 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v38 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v37 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v11, v60 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v35 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v34 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v54 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v39 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v32 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v49 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v36 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v33 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 @@ -7714,175 +7407,116 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; SI-LABEL: bitcast_v40f16_to_v20i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s14 -; SI-NEXT: s_lshr_b32 s14, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s14 -; SI-NEXT: s_lshr_b32 s14, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s14 -; SI-NEXT: s_lshr_b32 s14, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s14 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v59, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s21 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s23 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v15 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s25 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s27 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v5 +; SI-NEXT: v_mov_b32_e32 v32, v5 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_mov_b32_e32 v34, v3 +; SI-NEXT: v_mov_b32_e32 v35, v2 +; SI-NEXT: v_mov_b32_e32 v36, v1 +; SI-NEXT: v_mov_b32_e32 v37, v0 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v36 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v37 ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 -; SI-NEXT: v_or_b32_e32 v2, v46, v2 -; SI-NEXT: v_or_b32_e32 v3, v38, v3 -; SI-NEXT: v_or_b32_e32 v4, v37, v4 -; SI-NEXT: v_or_b32_e32 v5, v34, v5 -; SI-NEXT: v_or_b32_e32 v6, v33, v6 -; SI-NEXT: v_or_b32_e32 v7, v63, v7 -; SI-NEXT: v_or_b32_e32 v8, v62, v8 -; SI-NEXT: v_or_b32_e32 v9, v55, v9 -; SI-NEXT: v_or_b32_e32 v10, v54, v10 -; SI-NEXT: v_or_b32_e32 v11, v51, v11 -; SI-NEXT: v_or_b32_e32 v12, v50, v12 -; SI-NEXT: v_or_b32_e32 v13, v48, v13 -; SI-NEXT: v_or_b32_e32 v14, v30, v14 -; SI-NEXT: v_or_b32_e32 v15, v28, v15 -; SI-NEXT: v_or_b32_e32 v16, v26, v16 -; SI-NEXT: v_or_b32_e32 v17, v24, v17 -; SI-NEXT: v_or_b32_e32 v18, v22, v18 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v38 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -7895,11 +7529,10 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -7908,239 +7541,142 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v60 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v37 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v49 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v38 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v53 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v52 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v32 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v50 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v24 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v29 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v36 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v48 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v39 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v33 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 ; SI-NEXT: .LBB19_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB19_4: -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v39, v44 -; SI-NEXT: v_mov_b32_e32 v44, v48 -; SI-NEXT: v_mov_b32_e32 v48, v21 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v38, v45 -; SI-NEXT: v_mov_b32_e32 v45, v49 -; SI-NEXT: v_mov_b32_e32 v49, v20 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v37, v46 -; SI-NEXT: v_mov_b32_e32 v46, v50 -; SI-NEXT: v_mov_b32_e32 v50, v22 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v36, v47 -; SI-NEXT: v_mov_b32_e32 v47, v51 -; SI-NEXT: v_mov_b32_e32 v51, v23 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v35, v56 -; SI-NEXT: v_mov_b32_e32 v56, v52 -; SI-NEXT: v_mov_b32_e32 v52, v24 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v34, v57 -; SI-NEXT: v_mov_b32_e32 v57, v53 -; SI-NEXT: v_mov_b32_e32 v53, v25 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v33, v58 -; SI-NEXT: v_mov_b32_e32 v58, v54 -; SI-NEXT: v_mov_b32_e32 v54, v26 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v32, v59 -; SI-NEXT: v_mov_b32_e32 v59, v55 -; SI-NEXT: v_mov_b32_e32 v55, v27 -; SI-NEXT: v_mov_b32_e32 v40, v28 -; SI-NEXT: v_mov_b32_e32 v41, v29 -; SI-NEXT: v_mov_b32_e32 v42, v30 -; SI-NEXT: v_mov_b32_e32 v43, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v27, v55 -; SI-NEXT: v_mov_b32_e32 v55, v59 -; SI-NEXT: v_mov_b32_e32 v59, v32 -; SI-NEXT: v_mov_b32_e32 v26, v54 -; SI-NEXT: v_mov_b32_e32 v54, v58 -; SI-NEXT: v_mov_b32_e32 v58, v33 -; SI-NEXT: v_mov_b32_e32 v25, v53 -; SI-NEXT: v_mov_b32_e32 v53, v57 -; SI-NEXT: v_mov_b32_e32 v57, v34 -; SI-NEXT: v_mov_b32_e32 v24, v52 -; SI-NEXT: v_mov_b32_e32 v52, v56 -; SI-NEXT: v_mov_b32_e32 v56, v35 -; SI-NEXT: v_mov_b32_e32 v23, v51 -; SI-NEXT: v_mov_b32_e32 v51, v47 -; SI-NEXT: v_mov_b32_e32 v47, v36 -; SI-NEXT: v_mov_b32_e32 v22, v50 -; SI-NEXT: v_mov_b32_e32 v50, v46 -; SI-NEXT: v_mov_b32_e32 v46, v37 -; SI-NEXT: v_mov_b32_e32 v20, v49 -; SI-NEXT: v_mov_b32_e32 v49, v45 -; SI-NEXT: v_mov_b32_e32 v45, v38 -; SI-NEXT: v_mov_b32_e32 v21, v48 -; SI-NEXT: v_mov_b32_e32 v48, v44 -; SI-NEXT: v_mov_b32_e32 v44, v39 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v31, v43 -; SI-NEXT: v_mov_b32_e32 v30, v42 -; SI-NEXT: v_mov_b32_e32 v29, v41 -; SI-NEXT: v_mov_b32_e32 v28, v40 ; SI-NEXT: s_branch .LBB19_2 ; ; VI-LABEL: bitcast_v40f16_to_v20i32_scalar: @@ -12986,327 +12522,156 @@ define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB32_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v45, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v47, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v57, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v20, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v21, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v22, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v23, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v24, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v25, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v26, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v29, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v31, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v34, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 ; SI-NEXT: .LBB32_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_alignbit_b32 v20, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v21, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v22, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v23, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v24, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v25, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v26, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v29, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v31, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v34, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 ; SI-NEXT: .LBB32_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v56 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v44 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v41 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v54 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v53 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v49 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v50 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v37 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v38 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v33 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v34 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v28 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v30 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v24 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v27 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v21 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v39 +; SI-NEXT: v_or_b32_e32 v2, v2, v31 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v38 +; SI-NEXT: v_or_b32_e32 v4, v4, v29 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v37 +; SI-NEXT: v_or_b32_e32 v6, v6, v26 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v36 +; SI-NEXT: v_or_b32_e32 v8, v8, v25 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v35 +; SI-NEXT: v_or_b32_e32 v10, v10, v24 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v33 +; SI-NEXT: v_or_b32_e32 v12, v12, v23 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v32 +; SI-NEXT: v_or_b32_e32 v14, v14, v22 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 +; SI-NEXT: v_or_b32_e32 v16, v16, v21 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v28 +; SI-NEXT: v_or_b32_e32 v18, v18, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: v_or_b32_e32 v3, v3, v31 +; SI-NEXT: v_or_b32_e32 v5, v5, v29 +; SI-NEXT: v_or_b32_e32 v7, v7, v26 +; SI-NEXT: v_or_b32_e32 v9, v9, v25 +; SI-NEXT: v_or_b32_e32 v11, v11, v24 +; SI-NEXT: v_or_b32_e32 v13, v13, v23 +; SI-NEXT: v_or_b32_e32 v15, v15, v22 +; SI-NEXT: v_or_b32_e32 v17, v17, v21 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20f32_to_v40f16: @@ -13715,314 +13080,174 @@ define inreg <40 x half> @bitcast_v20f32_to_v40f16_scalar(<20 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v59, s16 -; SI-NEXT: v_mov_b32_e32 v58, s17 -; SI-NEXT: v_mov_b32_e32 v57, s18 -; SI-NEXT: v_mov_b32_e32 v56, s19 -; SI-NEXT: v_mov_b32_e32 v47, s20 -; SI-NEXT: v_mov_b32_e32 v46, s21 -; SI-NEXT: v_mov_b32_e32 v45, s22 -; SI-NEXT: v_mov_b32_e32 v44, s23 -; SI-NEXT: v_mov_b32_e32 v43, s24 -; SI-NEXT: v_mov_b32_e32 v42, s25 -; SI-NEXT: v_mov_b32_e32 v40, s26 -; SI-NEXT: v_mov_b32_e32 v55, s27 -; SI-NEXT: v_mov_b32_e32 v54, s28 +; SI-NEXT: v_mov_b32_e32 v20, s16 +; SI-NEXT: v_mov_b32_e32 v21, s17 +; SI-NEXT: v_mov_b32_e32 v18, s18 +; SI-NEXT: v_mov_b32_e32 v19, s19 +; SI-NEXT: v_mov_b32_e32 v16, s20 +; SI-NEXT: v_mov_b32_e32 v17, s21 +; SI-NEXT: v_mov_b32_e32 v14, s22 +; SI-NEXT: v_mov_b32_e32 v15, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v41, s29 +; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB33_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v59 +; SI-NEXT: v_lshr_b64 v[26:27], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[18:19], 16 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 +; SI-NEXT: v_lshr_b64 v[32:33], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[6:7], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[20:21], 16 ; SI-NEXT: s_cbranch_execnz .LBB33_3 ; SI-NEXT: .LBB33_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v6, 1.0, v59 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v58 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v57 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v56 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v47 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v46 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v45 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v44 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v43 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v42 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v40 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v55 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v54 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v41 -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_lshr_b64 v[26:27], v[4:5], 16 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshr_b64 v[27:28], v[2:3], 16 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_lshr_b64 v[28:29], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[16:17], 16 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_lshr_b64 v[29:30], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[6:7], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[20:21], 16 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 ; SI-NEXT: .LBB33_3: ; %end -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v50 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v38 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v36 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v34 -; SI-NEXT: v_or_b32_e32 v7, v7, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v32 -; SI-NEXT: v_or_b32_e32 v9, v30, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v30 -; SI-NEXT: v_or_b32_e32 v11, v28, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v28 -; SI-NEXT: v_or_b32_e32 v13, v26, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v26 -; SI-NEXT: v_or_b32_e32 v15, v24, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v24 -; SI-NEXT: v_or_b32_e32 v17, v22, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v22 -; SI-NEXT: v_or_b32_e32 v5, v35, v5 -; SI-NEXT: v_or_b32_e32 v6, v33, v6 -; SI-NEXT: v_or_b32_e32 v8, v31, v8 -; SI-NEXT: v_or_b32_e32 v10, v29, v10 -; SI-NEXT: v_or_b32_e32 v12, v27, v12 -; SI-NEXT: v_or_b32_e32 v14, v25, v14 -; SI-NEXT: v_or_b32_e32 v16, v23, v16 -; SI-NEXT: v_or_b32_e32 v18, v21, v18 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v24, v20, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v51 +; SI-NEXT: v_or_b32_e32 v25, v7, v20 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v23 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v20, v18, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v50 +; SI-NEXT: v_or_b32_e32 v21, v7, v18 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v22 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v22, v16, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v49 +; SI-NEXT: v_or_b32_e32 v23, v7, v16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v48 +; SI-NEXT: v_or_b32_e32 v7, v7, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 +; SI-NEXT: v_or_b32_e32 v8, v8, v14 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 +; SI-NEXT: v_or_b32_e32 v9, v9, v14 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; SI-NEXT: v_or_b32_e32 v10, v10, v14 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v29 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v37 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 +; SI-NEXT: v_or_b32_e32 v14, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v35 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v34 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v24 +; SI-NEXT: v_mov_b32_e32 v1, v25 +; SI-NEXT: v_mov_b32_e32 v2, v20 +; SI-NEXT: v_mov_b32_e32 v3, v21 +; SI-NEXT: v_mov_b32_e32 v4, v22 +; SI-NEXT: v_mov_b32_e32 v5, v23 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: s_branch .LBB33_2 ; ; VI-LABEL: bitcast_v20f32_to_v40f16_scalar: @@ -14619,86 +13844,7 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v40f16_to_v20f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_mov_b32_e32 v32, v19 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -14715,103 +13861,112 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 +; SI-NEXT: v_mov_b32_e32 v33, v18 +; SI-NEXT: v_mov_b32_e32 v43, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_mov_b32_e32 v34, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v15 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_mov_b32_e32 v35, v16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_mov_b32_e32 v36, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_mov_b32_e32 v37, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_mov_b32_e32 v38, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_mov_b32_e32 v39, v12 +; SI-NEXT: v_mov_b32_e32 v48, v11 +; SI-NEXT: v_mov_b32_e32 v49, v10 +; SI-NEXT: v_mov_b32_e32 v50, v9 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v7 +; SI-NEXT: v_mov_b32_e32 v53, v6 +; SI-NEXT: v_mov_b32_e32 v54, v5 +; SI-NEXT: v_mov_b32_e32 v55, v4 +; SI-NEXT: v_mov_b32_e32 v40, v3 +; SI-NEXT: v_mov_b32_e32 v41, v2 +; SI-NEXT: v_mov_b32_e32 v42, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v43 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v49 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 -; SI-NEXT: v_or_b32_e32 v1, v36, v1 -; SI-NEXT: v_or_b32_e32 v2, v34, v2 -; SI-NEXT: v_or_b32_e32 v3, v32, v3 -; SI-NEXT: v_or_b32_e32 v4, v62, v4 -; SI-NEXT: v_or_b32_e32 v5, v60, v5 -; SI-NEXT: v_or_b32_e32 v6, v58, v6 -; SI-NEXT: v_or_b32_e32 v7, v56, v7 -; SI-NEXT: v_or_b32_e32 v8, v46, v8 -; SI-NEXT: v_or_b32_e32 v9, v44, v9 -; SI-NEXT: v_or_b32_e32 v10, v42, v10 -; SI-NEXT: v_or_b32_e32 v11, v40, v11 -; SI-NEXT: v_or_b32_e32 v12, v54, v12 -; SI-NEXT: v_or_b32_e32 v13, v52, v13 -; SI-NEXT: v_or_b32_e32 v14, v50, v14 -; SI-NEXT: v_or_b32_e32 v15, v48, v15 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v56 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v46 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v45 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v44 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v63 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v62 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v60 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v39 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 @@ -14824,25 +13979,50 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v38 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v36 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v35 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v34 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v33 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr20 @@ -14856,6 +14036,13 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; kill: killed $vgpr20 @@ -14864,10 +14051,10 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB34_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -14880,14 +14067,10 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -14896,143 +14079,149 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v54 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v53 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v52 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v51 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v61 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v49 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v53 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v48 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v45 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v62 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v38 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v37 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v11, v60 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v35 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v34 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v54 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v39 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v32 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v49 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v36 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v33 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 @@ -15588,175 +14777,116 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; SI-LABEL: bitcast_v40f16_to_v20f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s14 -; SI-NEXT: s_lshr_b32 s14, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s14 -; SI-NEXT: s_lshr_b32 s14, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s14 -; SI-NEXT: s_lshr_b32 s14, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s14 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v59, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s21 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s23 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v15 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s25 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s27 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v5 +; SI-NEXT: v_mov_b32_e32 v32, v5 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_mov_b32_e32 v34, v3 +; SI-NEXT: v_mov_b32_e32 v35, v2 +; SI-NEXT: v_mov_b32_e32 v36, v1 +; SI-NEXT: v_mov_b32_e32 v37, v0 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v36 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v37 ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 -; SI-NEXT: v_or_b32_e32 v2, v46, v2 -; SI-NEXT: v_or_b32_e32 v3, v38, v3 -; SI-NEXT: v_or_b32_e32 v4, v37, v4 -; SI-NEXT: v_or_b32_e32 v5, v34, v5 -; SI-NEXT: v_or_b32_e32 v6, v33, v6 -; SI-NEXT: v_or_b32_e32 v7, v63, v7 -; SI-NEXT: v_or_b32_e32 v8, v62, v8 -; SI-NEXT: v_or_b32_e32 v9, v55, v9 -; SI-NEXT: v_or_b32_e32 v10, v54, v10 -; SI-NEXT: v_or_b32_e32 v11, v51, v11 -; SI-NEXT: v_or_b32_e32 v12, v50, v12 -; SI-NEXT: v_or_b32_e32 v13, v48, v13 -; SI-NEXT: v_or_b32_e32 v14, v30, v14 -; SI-NEXT: v_or_b32_e32 v15, v28, v15 -; SI-NEXT: v_or_b32_e32 v16, v26, v16 -; SI-NEXT: v_or_b32_e32 v17, v24, v17 -; SI-NEXT: v_or_b32_e32 v18, v22, v18 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v38 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -15769,11 +14899,10 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -15782,239 +14911,142 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v60 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v37 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v49 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v38 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v53 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v52 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v32 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v50 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v24 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v29 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v36 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v48 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v39 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v33 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 ; SI-NEXT: .LBB35_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB35_4: -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v39, v44 -; SI-NEXT: v_mov_b32_e32 v44, v48 -; SI-NEXT: v_mov_b32_e32 v48, v21 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v38, v45 -; SI-NEXT: v_mov_b32_e32 v45, v49 -; SI-NEXT: v_mov_b32_e32 v49, v20 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v37, v46 -; SI-NEXT: v_mov_b32_e32 v46, v50 -; SI-NEXT: v_mov_b32_e32 v50, v22 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v36, v47 -; SI-NEXT: v_mov_b32_e32 v47, v51 -; SI-NEXT: v_mov_b32_e32 v51, v23 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v35, v56 -; SI-NEXT: v_mov_b32_e32 v56, v52 -; SI-NEXT: v_mov_b32_e32 v52, v24 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v34, v57 -; SI-NEXT: v_mov_b32_e32 v57, v53 -; SI-NEXT: v_mov_b32_e32 v53, v25 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v33, v58 -; SI-NEXT: v_mov_b32_e32 v58, v54 -; SI-NEXT: v_mov_b32_e32 v54, v26 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v32, v59 -; SI-NEXT: v_mov_b32_e32 v59, v55 -; SI-NEXT: v_mov_b32_e32 v55, v27 -; SI-NEXT: v_mov_b32_e32 v40, v28 -; SI-NEXT: v_mov_b32_e32 v41, v29 -; SI-NEXT: v_mov_b32_e32 v42, v30 -; SI-NEXT: v_mov_b32_e32 v43, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v27, v55 -; SI-NEXT: v_mov_b32_e32 v55, v59 -; SI-NEXT: v_mov_b32_e32 v59, v32 -; SI-NEXT: v_mov_b32_e32 v26, v54 -; SI-NEXT: v_mov_b32_e32 v54, v58 -; SI-NEXT: v_mov_b32_e32 v58, v33 -; SI-NEXT: v_mov_b32_e32 v25, v53 -; SI-NEXT: v_mov_b32_e32 v53, v57 -; SI-NEXT: v_mov_b32_e32 v57, v34 -; SI-NEXT: v_mov_b32_e32 v24, v52 -; SI-NEXT: v_mov_b32_e32 v52, v56 -; SI-NEXT: v_mov_b32_e32 v56, v35 -; SI-NEXT: v_mov_b32_e32 v23, v51 -; SI-NEXT: v_mov_b32_e32 v51, v47 -; SI-NEXT: v_mov_b32_e32 v47, v36 -; SI-NEXT: v_mov_b32_e32 v22, v50 -; SI-NEXT: v_mov_b32_e32 v50, v46 -; SI-NEXT: v_mov_b32_e32 v46, v37 -; SI-NEXT: v_mov_b32_e32 v20, v49 -; SI-NEXT: v_mov_b32_e32 v49, v45 -; SI-NEXT: v_mov_b32_e32 v45, v38 -; SI-NEXT: v_mov_b32_e32 v21, v48 -; SI-NEXT: v_mov_b32_e32 v48, v44 -; SI-NEXT: v_mov_b32_e32 v44, v39 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v31, v43 -; SI-NEXT: v_mov_b32_e32 v30, v42 -; SI-NEXT: v_mov_b32_e32 v29, v41 -; SI-NEXT: v_mov_b32_e32 v28, v40 ; SI-NEXT: s_branch .LBB35_2 ; ; VI-LABEL: bitcast_v40f16_to_v20f32_scalar: @@ -20162,146 +19194,50 @@ define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v45, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v47, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v57, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_alignbit_b32 v20, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v21, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v22, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v23, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v24, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v25, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v26, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v28, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v31, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v34, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 ; SI-NEXT: .LBB44_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_4 @@ -20326,163 +19262,88 @@ define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_alignbit_b32 v20, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v21, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v22, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v23, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v24, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v25, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v26, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v28, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v31, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v34, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 ; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v56 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v44 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v41 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v54 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v53 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v49 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v50 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v37 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v38 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v33 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v34 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v28 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v30 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v24 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v27 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v21 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v39 +; SI-NEXT: v_or_b32_e32 v2, v2, v31 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v38 +; SI-NEXT: v_or_b32_e32 v4, v4, v28 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v37 +; SI-NEXT: v_or_b32_e32 v6, v6, v26 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v36 +; SI-NEXT: v_or_b32_e32 v8, v8, v25 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v35 +; SI-NEXT: v_or_b32_e32 v10, v10, v24 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v33 +; SI-NEXT: v_or_b32_e32 v12, v12, v23 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v32 +; SI-NEXT: v_or_b32_e32 v14, v14, v22 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 +; SI-NEXT: v_or_b32_e32 v16, v16, v21 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: v_or_b32_e32 v18, v18, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: v_or_b32_e32 v3, v3, v31 +; SI-NEXT: v_or_b32_e32 v5, v5, v28 +; SI-NEXT: v_or_b32_e32 v7, v7, v26 +; SI-NEXT: v_or_b32_e32 v9, v9, v25 +; SI-NEXT: v_or_b32_e32 v11, v11, v24 +; SI-NEXT: v_or_b32_e32 v13, v13, v23 +; SI-NEXT: v_or_b32_e32 v15, v15, v22 +; SI-NEXT: v_or_b32_e32 v17, v17, v21 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10i64_to_v40f16: @@ -20933,296 +19794,196 @@ define inreg <40 x half> @bitcast_v10i64_to_v40f16_scalar(<10 x i64> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v17, s26 ; SI-NEXT: v_mov_b32_e32 v18, s27 ; SI-NEXT: v_mov_b32_e32 v19, s28 -; SI-NEXT: v_readfirstlane_b32 s24, v7 +; SI-NEXT: v_readfirstlane_b32 s22, v7 ; SI-NEXT: v_mov_b32_e32 v7, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: v_readfirstlane_b32 s25, v8 -; SI-NEXT: v_readfirstlane_b32 s22, v9 -; SI-NEXT: v_readfirstlane_b32 s23, v10 -; SI-NEXT: v_readfirstlane_b32 s20, v11 -; SI-NEXT: v_readfirstlane_b32 s21, v12 -; SI-NEXT: v_readfirstlane_b32 s18, v13 -; SI-NEXT: v_readfirstlane_b32 s19, v14 -; SI-NEXT: v_readfirstlane_b32 s16, v15 -; SI-NEXT: v_readfirstlane_b32 s17, v16 -; SI-NEXT: v_readfirstlane_b32 s14, v17 -; SI-NEXT: v_readfirstlane_b32 s15, v18 -; SI-NEXT: v_readfirstlane_b32 s12, v19 -; SI-NEXT: v_readfirstlane_b32 s13, v7 -; SI-NEXT: v_readfirstlane_b32 s10, v0 -; SI-NEXT: v_readfirstlane_b32 s11, v1 -; SI-NEXT: v_readfirstlane_b32 s7, v2 -; SI-NEXT: v_readfirstlane_b32 s8, v3 -; SI-NEXT: v_readfirstlane_b32 s6, v4 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v5 +; SI-NEXT: v_readfirstlane_b32 s23, v8 +; SI-NEXT: v_readfirstlane_b32 s20, v9 +; SI-NEXT: v_readfirstlane_b32 s21, v10 +; SI-NEXT: v_readfirstlane_b32 s18, v11 +; SI-NEXT: v_readfirstlane_b32 s19, v12 +; SI-NEXT: v_readfirstlane_b32 s16, v13 +; SI-NEXT: v_readfirstlane_b32 s17, v14 +; SI-NEXT: v_readfirstlane_b32 s14, v15 +; SI-NEXT: v_readfirstlane_b32 s15, v16 +; SI-NEXT: v_readfirstlane_b32 s12, v17 +; SI-NEXT: v_readfirstlane_b32 s13, v18 +; SI-NEXT: v_readfirstlane_b32 s10, v19 +; SI-NEXT: v_readfirstlane_b32 s11, v7 +; SI-NEXT: v_readfirstlane_b32 s8, v0 +; SI-NEXT: v_readfirstlane_b32 s9, v1 +; SI-NEXT: v_readfirstlane_b32 s6, v2 +; SI-NEXT: v_readfirstlane_b32 s7, v3 +; SI-NEXT: v_readfirstlane_b32 s4, v4 +; SI-NEXT: s_and_b64 s[24:25], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v5 ; SI-NEXT: s_cbranch_scc0 .LBB45_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s24 +; SI-NEXT: s_lshr_b32 s72, s5, 16 +; SI-NEXT: s_lshr_b32 s73, s7, 16 +; SI-NEXT: s_lshr_b32 s74, s9, 16 +; SI-NEXT: s_lshr_b32 s75, s11, 16 +; SI-NEXT: s_lshr_b32 s76, s13, 16 +; SI-NEXT: s_lshr_b32 s77, s15, 16 +; SI-NEXT: s_lshr_b32 s78, s17, 16 +; SI-NEXT: s_lshr_b32 s79, s19, 16 +; SI-NEXT: s_lshr_b32 s88, s21, 16 +; SI-NEXT: s_lshr_b32 s89, s23, 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[22:23], 16 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s24, 3 -; SI-NEXT: s_addc_u32 s5, s25, 0 -; SI-NEXT: s_lshr_b32 s24, s4, 16 -; SI-NEXT: s_lshr_b32 s25, s5, 16 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_lshr_b32 s26, s22, 16 -; SI-NEXT: s_lshr_b32 s27, s23, 16 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_lshr_b32 s28, s20, 16 -; SI-NEXT: s_lshr_b32 s29, s21, 16 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_lshr_b32 s40, s18, 16 -; SI-NEXT: s_lshr_b32 s41, s19, 16 -; SI-NEXT: s_add_u32 s16, s16, 3 -; SI-NEXT: s_addc_u32 s17, s17, 0 -; SI-NEXT: s_lshr_b32 s42, s16, 16 -; SI-NEXT: s_lshr_b32 s43, s17, 16 -; SI-NEXT: s_add_u32 s14, s14, 3 -; SI-NEXT: s_addc_u32 s15, s15, 0 -; SI-NEXT: s_lshr_b32 s44, s14, 16 -; SI-NEXT: s_lshr_b32 s45, s15, 16 -; SI-NEXT: s_add_u32 s12, s12, 3 -; SI-NEXT: s_addc_u32 s13, s13, 0 -; SI-NEXT: s_lshr_b32 s46, s12, 16 -; SI-NEXT: s_lshr_b32 s47, s13, 16 -; SI-NEXT: s_add_u32 s10, s10, 3 -; SI-NEXT: s_addc_u32 s11, s11, 0 -; SI-NEXT: s_lshr_b32 s56, s10, 16 -; SI-NEXT: s_lshr_b32 s57, s11, 16 -; SI-NEXT: s_add_u32 s7, s7, 3 -; SI-NEXT: s_addc_u32 s8, s8, 0 -; SI-NEXT: s_lshr_b32 s58, s7, 16 -; SI-NEXT: s_lshr_b32 s59, s8, 16 +; SI-NEXT: s_add_u32 s4, s4, 3 +; SI-NEXT: s_addc_u32 s5, s5, 0 ; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s7, s7, 0 +; SI-NEXT: s_add_u32 s8, s8, 3 ; SI-NEXT: s_addc_u32 s9, s9, 0 -; SI-NEXT: s_lshr_b32 s60, s6, 16 -; SI-NEXT: s_lshr_b32 s61, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s24 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_lshr_b32 s72, s5, 16 +; SI-NEXT: s_lshr_b32 s73, s7, 16 +; SI-NEXT: s_lshr_b32 s74, s9, 16 +; SI-NEXT: s_lshr_b32 s75, s11, 16 +; SI-NEXT: s_lshr_b32 s76, s13, 16 +; SI-NEXT: s_lshr_b32 s77, s15, 16 +; SI-NEXT: s_lshr_b32 s78, s17, 16 +; SI-NEXT: s_lshr_b32 s79, s19, 16 +; SI-NEXT: s_lshr_b32 s88, s21, 16 +; SI-NEXT: s_lshr_b32 s89, s23, 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[22:23], 16 ; SI-NEXT: .LBB45_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; SI-NEXT: v_or_b32_e32 v2, v36, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v36 -; SI-NEXT: v_or_b32_e32 v5, v5, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v34 -; SI-NEXT: v_or_b32_e32 v7, v7, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v32 -; SI-NEXT: v_or_b32_e32 v9, v30, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v30 -; SI-NEXT: v_or_b32_e32 v11, v28, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v28 -; SI-NEXT: v_or_b32_e32 v13, v26, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v26 -; SI-NEXT: v_or_b32_e32 v15, v24, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v17, v22, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v22 -; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_or_b32_e32 v3, v38, v3 -; SI-NEXT: v_or_b32_e32 v4, v35, v4 -; SI-NEXT: v_or_b32_e32 v6, v33, v6 -; SI-NEXT: v_or_b32_e32 v8, v31, v8 -; SI-NEXT: v_or_b32_e32 v10, v29, v10 -; SI-NEXT: v_or_b32_e32 v12, v27, v12 -; SI-NEXT: v_or_b32_e32 v14, v25, v14 -; SI-NEXT: v_or_b32_e32 v16, v23, v16 -; SI-NEXT: v_or_b32_e32 v18, v21, v18 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: s_lshl_b32 s25, s60, 16 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_or_b32 s22, s22, s25 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s25, s89, 16 +; SI-NEXT: s_or_b32 s23, s23, s25 +; SI-NEXT: s_lshl_b32 s25, s58, 16 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_or_b32 s20, s20, s25 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s25, s88, 16 +; SI-NEXT: s_or_b32 s21, s21, s25 +; SI-NEXT: s_lshl_b32 s25, s56, 16 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_or_b32 s18, s18, s25 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s25, s79, 16 +; SI-NEXT: s_or_b32 s19, s19, s25 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s25, s46, 16 +; SI-NEXT: s_or_b32 s16, s16, s25 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s25, s78, 16 +; SI-NEXT: s_or_b32 s17, s17, s25 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s25, s44, 16 +; SI-NEXT: s_or_b32 s14, s14, s25 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s25, s77, 16 +; SI-NEXT: s_or_b32 s15, s15, s25 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s25, s42, 16 +; SI-NEXT: s_or_b32 s12, s12, s25 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s25, s76, 16 +; SI-NEXT: s_or_b32 s13, s13, s25 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s25, s40, 16 +; SI-NEXT: s_or_b32 s10, s10, s25 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s25, s75, 16 +; SI-NEXT: s_or_b32 s11, s11, s25 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s25, s28, 16 +; SI-NEXT: s_or_b32 s8, s8, s25 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s25, s74, 16 +; SI-NEXT: s_or_b32 s9, s9, s25 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s25, s26, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s24, s24, 16 +; SI-NEXT: s_or_b32 s6, s6, s25 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s25, s73, 16 +; SI-NEXT: s_or_b32 s4, s4, s24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s24, s72, 16 +; SI-NEXT: s_or_b32 s7, s7, s25 +; SI-NEXT: s_or_b32 s5, s5, s24 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_mov_b32_e32 v1, s23 +; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: v_mov_b32_e32 v3, s21 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v5, s19 +; SI-NEXT: v_mov_b32_e32 v6, s16 +; SI-NEXT: v_mov_b32_e32 v7, s17 +; SI-NEXT: v_mov_b32_e32 v8, s14 +; SI-NEXT: v_mov_b32_e32 v9, s15 +; SI-NEXT: v_mov_b32_e32 v10, s12 +; SI-NEXT: v_mov_b32_e32 v11, s13 +; SI-NEXT: v_mov_b32_e32 v12, s10 +; SI-NEXT: v_mov_b32_e32 v13, s11 +; SI-NEXT: v_mov_b32_e32 v14, s8 +; SI-NEXT: v_mov_b32_e32 v15, s9 +; SI-NEXT: v_mov_b32_e32 v16, s6 +; SI-NEXT: v_mov_b32_e32 v17, s7 +; SI-NEXT: v_mov_b32_e32 v18, s4 +; SI-NEXT: v_mov_b32_e32 v19, s5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: s_branch .LBB45_2 ; ; VI-LABEL: bitcast_v10i64_to_v40f16_scalar: @@ -21777,86 +20538,7 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v40f16_to_v10i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_mov_b32_e32 v32, v19 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -21873,103 +20555,112 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 +; SI-NEXT: v_mov_b32_e32 v33, v18 +; SI-NEXT: v_mov_b32_e32 v43, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_mov_b32_e32 v34, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v15 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_mov_b32_e32 v35, v16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_mov_b32_e32 v36, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_mov_b32_e32 v37, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_mov_b32_e32 v38, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_mov_b32_e32 v39, v12 +; SI-NEXT: v_mov_b32_e32 v48, v11 +; SI-NEXT: v_mov_b32_e32 v49, v10 +; SI-NEXT: v_mov_b32_e32 v50, v9 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v7 +; SI-NEXT: v_mov_b32_e32 v53, v6 +; SI-NEXT: v_mov_b32_e32 v54, v5 +; SI-NEXT: v_mov_b32_e32 v55, v4 +; SI-NEXT: v_mov_b32_e32 v40, v3 +; SI-NEXT: v_mov_b32_e32 v41, v2 +; SI-NEXT: v_mov_b32_e32 v42, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v43 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v49 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 -; SI-NEXT: v_or_b32_e32 v1, v36, v1 -; SI-NEXT: v_or_b32_e32 v2, v34, v2 -; SI-NEXT: v_or_b32_e32 v3, v32, v3 -; SI-NEXT: v_or_b32_e32 v4, v62, v4 -; SI-NEXT: v_or_b32_e32 v5, v60, v5 -; SI-NEXT: v_or_b32_e32 v6, v58, v6 -; SI-NEXT: v_or_b32_e32 v7, v56, v7 -; SI-NEXT: v_or_b32_e32 v8, v46, v8 -; SI-NEXT: v_or_b32_e32 v9, v44, v9 -; SI-NEXT: v_or_b32_e32 v10, v42, v10 -; SI-NEXT: v_or_b32_e32 v11, v40, v11 -; SI-NEXT: v_or_b32_e32 v12, v54, v12 -; SI-NEXT: v_or_b32_e32 v13, v52, v13 -; SI-NEXT: v_or_b32_e32 v14, v50, v14 -; SI-NEXT: v_or_b32_e32 v15, v48, v15 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v56 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v46 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v45 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v44 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v63 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v62 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v60 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v39 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 @@ -21982,25 +20673,50 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v38 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v36 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v35 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v34 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v33 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr20 @@ -22014,6 +20730,13 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; kill: killed $vgpr20 @@ -22022,10 +20745,10 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB46_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -22038,14 +20761,10 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -22054,143 +20773,149 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v54 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v53 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v52 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v51 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v61 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v49 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v53 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v48 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v45 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v62 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v38 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v37 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v11, v60 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v35 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v34 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v54 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v39 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v32 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v49 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v36 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v33 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 @@ -22746,175 +21471,116 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; SI-LABEL: bitcast_v40f16_to_v10i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s14 -; SI-NEXT: s_lshr_b32 s14, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s14 -; SI-NEXT: s_lshr_b32 s14, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s14 -; SI-NEXT: s_lshr_b32 s14, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s14 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v59, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s21 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s23 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v15 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s25 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s27 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v5 +; SI-NEXT: v_mov_b32_e32 v32, v5 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_mov_b32_e32 v34, v3 +; SI-NEXT: v_mov_b32_e32 v35, v2 +; SI-NEXT: v_mov_b32_e32 v36, v1 +; SI-NEXT: v_mov_b32_e32 v37, v0 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v36 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v37 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 -; SI-NEXT: v_or_b32_e32 v2, v46, v2 -; SI-NEXT: v_or_b32_e32 v3, v38, v3 -; SI-NEXT: v_or_b32_e32 v4, v37, v4 -; SI-NEXT: v_or_b32_e32 v5, v34, v5 -; SI-NEXT: v_or_b32_e32 v6, v33, v6 -; SI-NEXT: v_or_b32_e32 v7, v63, v7 -; SI-NEXT: v_or_b32_e32 v8, v62, v8 -; SI-NEXT: v_or_b32_e32 v9, v55, v9 -; SI-NEXT: v_or_b32_e32 v10, v54, v10 -; SI-NEXT: v_or_b32_e32 v11, v51, v11 -; SI-NEXT: v_or_b32_e32 v12, v50, v12 -; SI-NEXT: v_or_b32_e32 v13, v48, v13 -; SI-NEXT: v_or_b32_e32 v14, v30, v14 -; SI-NEXT: v_or_b32_e32 v15, v28, v15 -; SI-NEXT: v_or_b32_e32 v16, v26, v16 -; SI-NEXT: v_or_b32_e32 v17, v24, v17 -; SI-NEXT: v_or_b32_e32 v18, v22, v18 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v38 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -22927,11 +21593,10 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -22940,239 +21605,142 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v60 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v37 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v49 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v38 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v53 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v52 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v32 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v50 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v24 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v29 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v36 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v48 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v39 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v33 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 ; SI-NEXT: .LBB47_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v39, v44 -; SI-NEXT: v_mov_b32_e32 v44, v48 -; SI-NEXT: v_mov_b32_e32 v48, v21 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v38, v45 -; SI-NEXT: v_mov_b32_e32 v45, v49 -; SI-NEXT: v_mov_b32_e32 v49, v20 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v37, v46 -; SI-NEXT: v_mov_b32_e32 v46, v50 -; SI-NEXT: v_mov_b32_e32 v50, v22 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v36, v47 -; SI-NEXT: v_mov_b32_e32 v47, v51 -; SI-NEXT: v_mov_b32_e32 v51, v23 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v35, v56 -; SI-NEXT: v_mov_b32_e32 v56, v52 -; SI-NEXT: v_mov_b32_e32 v52, v24 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v34, v57 -; SI-NEXT: v_mov_b32_e32 v57, v53 -; SI-NEXT: v_mov_b32_e32 v53, v25 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v33, v58 -; SI-NEXT: v_mov_b32_e32 v58, v54 -; SI-NEXT: v_mov_b32_e32 v54, v26 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v32, v59 -; SI-NEXT: v_mov_b32_e32 v59, v55 -; SI-NEXT: v_mov_b32_e32 v55, v27 -; SI-NEXT: v_mov_b32_e32 v40, v28 -; SI-NEXT: v_mov_b32_e32 v41, v29 -; SI-NEXT: v_mov_b32_e32 v42, v30 -; SI-NEXT: v_mov_b32_e32 v43, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v27, v55 -; SI-NEXT: v_mov_b32_e32 v55, v59 -; SI-NEXT: v_mov_b32_e32 v59, v32 -; SI-NEXT: v_mov_b32_e32 v26, v54 -; SI-NEXT: v_mov_b32_e32 v54, v58 -; SI-NEXT: v_mov_b32_e32 v58, v33 -; SI-NEXT: v_mov_b32_e32 v25, v53 -; SI-NEXT: v_mov_b32_e32 v53, v57 -; SI-NEXT: v_mov_b32_e32 v57, v34 -; SI-NEXT: v_mov_b32_e32 v24, v52 -; SI-NEXT: v_mov_b32_e32 v52, v56 -; SI-NEXT: v_mov_b32_e32 v56, v35 -; SI-NEXT: v_mov_b32_e32 v23, v51 -; SI-NEXT: v_mov_b32_e32 v51, v47 -; SI-NEXT: v_mov_b32_e32 v47, v36 -; SI-NEXT: v_mov_b32_e32 v22, v50 -; SI-NEXT: v_mov_b32_e32 v50, v46 -; SI-NEXT: v_mov_b32_e32 v46, v37 -; SI-NEXT: v_mov_b32_e32 v20, v49 -; SI-NEXT: v_mov_b32_e32 v49, v45 -; SI-NEXT: v_mov_b32_e32 v45, v38 -; SI-NEXT: v_mov_b32_e32 v21, v48 -; SI-NEXT: v_mov_b32_e32 v48, v44 -; SI-NEXT: v_mov_b32_e32 v44, v39 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v31, v43 -; SI-NEXT: v_mov_b32_e32 v30, v42 -; SI-NEXT: v_mov_b32_e32 v29, v41 -; SI-NEXT: v_mov_b32_e32 v28, v40 ; SI-NEXT: s_branch .LBB47_2 ; ; VI-LABEL: bitcast_v40f16_to_v10i64_scalar: @@ -26548,136 +25116,50 @@ define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v45, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v47, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v57, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_alignbit_b32 v20, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v21, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v22, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v23, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v24, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v25, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v26, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v29, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v32, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v35, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 ; SI-NEXT: .LBB52_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_4 @@ -26687,168 +25169,93 @@ define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: .LBB52_4: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v56 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v44 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v41 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v54 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v53 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v49 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v50 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v37 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v38 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v33 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v34 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v28 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v30 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v24 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v27 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v21 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_alignbit_b32 v20, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v21, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v22, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v23, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v24, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v25, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v26, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v29, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v32, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v35, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 +; SI-NEXT: .LBB52_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v39 +; SI-NEXT: v_or_b32_e32 v2, v2, v32 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v38 +; SI-NEXT: v_or_b32_e32 v4, v4, v29 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v37 +; SI-NEXT: v_or_b32_e32 v6, v6, v26 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v36 +; SI-NEXT: v_or_b32_e32 v8, v8, v25 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v34 +; SI-NEXT: v_or_b32_e32 v10, v10, v24 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v33 +; SI-NEXT: v_or_b32_e32 v12, v12, v23 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v31 +; SI-NEXT: v_or_b32_e32 v14, v14, v22 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 +; SI-NEXT: v_or_b32_e32 v16, v16, v21 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v28 +; SI-NEXT: v_or_b32_e32 v18, v18, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v35 +; SI-NEXT: v_or_b32_e32 v3, v3, v32 +; SI-NEXT: v_or_b32_e32 v5, v5, v29 +; SI-NEXT: v_or_b32_e32 v7, v7, v26 +; SI-NEXT: v_or_b32_e32 v9, v9, v25 +; SI-NEXT: v_or_b32_e32 v11, v11, v24 +; SI-NEXT: v_or_b32_e32 v13, v13, v23 +; SI-NEXT: v_or_b32_e32 v15, v15, v22 +; SI-NEXT: v_or_b32_e32 v17, v17, v21 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10f64_to_v40f16: @@ -27237,309 +25644,164 @@ define inreg <40 x half> @bitcast_v10f64_to_v40f16_scalar(<10 x double> inreg %a ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: v_mov_b32_e32 v18, s16 -; SI-NEXT: v_mov_b32_e32 v19, s17 -; SI-NEXT: v_mov_b32_e32 v16, s18 -; SI-NEXT: v_mov_b32_e32 v17, s19 -; SI-NEXT: v_mov_b32_e32 v14, s20 -; SI-NEXT: v_mov_b32_e32 v15, s21 -; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: v_mov_b32_e32 v13, s23 -; SI-NEXT: v_mov_b32_e32 v10, s24 -; SI-NEXT: v_mov_b32_e32 v11, s25 -; SI-NEXT: v_mov_b32_e32 v8, s26 -; SI-NEXT: v_mov_b32_e32 v9, s27 -; SI-NEXT: v_mov_b32_e32 v6, s28 +; SI-NEXT: v_mov_b32_e32 v20, s16 +; SI-NEXT: v_mov_b32_e32 v21, s17 +; SI-NEXT: v_mov_b32_e32 v18, s18 +; SI-NEXT: v_mov_b32_e32 v19, s19 +; SI-NEXT: v_mov_b32_e32 v16, s20 +; SI-NEXT: v_mov_b32_e32 v17, s21 +; SI-NEXT: v_mov_b32_e32 v14, s22 +; SI-NEXT: v_mov_b32_e32 v15, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v7, s29 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v45, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v47, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v57, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v18 +; SI-NEXT: v_lshr_b64 v[26:27], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[18:19], 16 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 +; SI-NEXT: v_lshr_b64 v[32:33], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[6:7], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[20:21], 16 ; SI-NEXT: s_cbranch_execnz .LBB53_3 ; SI-NEXT: .LBB53_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_lshr_b64 v[26:27], v[4:5], 16 ; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_lshr_b64 v[27:28], v[2:3], 16 ; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_lshr_b64 v[28:29], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[6:7], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[20:21], 16 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 ; SI-NEXT: .LBB53_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v56 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v44 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v41 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v54 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v53 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v24, v20, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v51 +; SI-NEXT: v_or_b32_e32 v25, v7, v20 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v23 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v20, v18, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v50 +; SI-NEXT: v_or_b32_e32 v21, v7, v18 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v22 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v22, v16, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v49 +; SI-NEXT: v_or_b32_e32 v23, v7, v16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v49 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v50 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v37 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v38 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v33 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v34 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v28 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v30 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v24 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v27 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v21 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v48 +; SI-NEXT: v_or_b32_e32 v7, v7, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 +; SI-NEXT: v_or_b32_e32 v8, v8, v14 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 +; SI-NEXT: v_or_b32_e32 v9, v9, v14 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; SI-NEXT: v_or_b32_e32 v10, v10, v14 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v29 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v37 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 +; SI-NEXT: v_or_b32_e32 v14, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v35 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v34 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v24 +; SI-NEXT: v_mov_b32_e32 v1, v25 +; SI-NEXT: v_mov_b32_e32 v2, v20 +; SI-NEXT: v_mov_b32_e32 v3, v21 +; SI-NEXT: v_mov_b32_e32 v4, v22 +; SI-NEXT: v_mov_b32_e32 v5, v23 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: s_branch .LBB53_2 ; ; VI-LABEL: bitcast_v10f64_to_v40f16_scalar: @@ -28116,86 +26378,7 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v40f16_to_v10f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_mov_b32_e32 v32, v19 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -28212,103 +26395,112 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 +; SI-NEXT: v_mov_b32_e32 v33, v18 +; SI-NEXT: v_mov_b32_e32 v43, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_mov_b32_e32 v34, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v15 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_mov_b32_e32 v35, v16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_mov_b32_e32 v36, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_mov_b32_e32 v37, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_mov_b32_e32 v38, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_mov_b32_e32 v39, v12 +; SI-NEXT: v_mov_b32_e32 v48, v11 +; SI-NEXT: v_mov_b32_e32 v49, v10 +; SI-NEXT: v_mov_b32_e32 v50, v9 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v7 +; SI-NEXT: v_mov_b32_e32 v53, v6 +; SI-NEXT: v_mov_b32_e32 v54, v5 +; SI-NEXT: v_mov_b32_e32 v55, v4 +; SI-NEXT: v_mov_b32_e32 v40, v3 +; SI-NEXT: v_mov_b32_e32 v41, v2 +; SI-NEXT: v_mov_b32_e32 v42, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v43 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v49 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 -; SI-NEXT: v_or_b32_e32 v1, v36, v1 -; SI-NEXT: v_or_b32_e32 v2, v34, v2 -; SI-NEXT: v_or_b32_e32 v3, v32, v3 -; SI-NEXT: v_or_b32_e32 v4, v62, v4 -; SI-NEXT: v_or_b32_e32 v5, v60, v5 -; SI-NEXT: v_or_b32_e32 v6, v58, v6 -; SI-NEXT: v_or_b32_e32 v7, v56, v7 -; SI-NEXT: v_or_b32_e32 v8, v46, v8 -; SI-NEXT: v_or_b32_e32 v9, v44, v9 -; SI-NEXT: v_or_b32_e32 v10, v42, v10 -; SI-NEXT: v_or_b32_e32 v11, v40, v11 -; SI-NEXT: v_or_b32_e32 v12, v54, v12 -; SI-NEXT: v_or_b32_e32 v13, v52, v13 -; SI-NEXT: v_or_b32_e32 v14, v50, v14 -; SI-NEXT: v_or_b32_e32 v15, v48, v15 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v56 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v46 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v45 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v44 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v63 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v62 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v60 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v39 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 @@ -28321,25 +26513,50 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v38 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v36 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v35 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v34 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v33 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr20 @@ -28353,6 +26570,13 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; kill: killed $vgpr20 @@ -28361,10 +26585,10 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB54_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -28377,14 +26601,10 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -28393,143 +26613,149 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v54 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v53 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v52 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v51 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v61 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v49 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v53 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v48 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v45 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v62 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v38 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v37 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v11, v60 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v35 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v34 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v54 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v39 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v32 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v49 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v36 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v33 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 @@ -29085,175 +27311,116 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; SI-LABEL: bitcast_v40f16_to_v10f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s14 -; SI-NEXT: s_lshr_b32 s14, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s14 -; SI-NEXT: s_lshr_b32 s14, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s14 -; SI-NEXT: s_lshr_b32 s14, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s14 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v59, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s21 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s23 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v15 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s25 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s27 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v5 +; SI-NEXT: v_mov_b32_e32 v32, v5 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_mov_b32_e32 v34, v3 +; SI-NEXT: v_mov_b32_e32 v35, v2 +; SI-NEXT: v_mov_b32_e32 v36, v1 +; SI-NEXT: v_mov_b32_e32 v37, v0 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v36 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v37 ; SI-NEXT: s_cbranch_scc0 .LBB55_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 -; SI-NEXT: v_or_b32_e32 v2, v46, v2 -; SI-NEXT: v_or_b32_e32 v3, v38, v3 -; SI-NEXT: v_or_b32_e32 v4, v37, v4 -; SI-NEXT: v_or_b32_e32 v5, v34, v5 -; SI-NEXT: v_or_b32_e32 v6, v33, v6 -; SI-NEXT: v_or_b32_e32 v7, v63, v7 -; SI-NEXT: v_or_b32_e32 v8, v62, v8 -; SI-NEXT: v_or_b32_e32 v9, v55, v9 -; SI-NEXT: v_or_b32_e32 v10, v54, v10 -; SI-NEXT: v_or_b32_e32 v11, v51, v11 -; SI-NEXT: v_or_b32_e32 v12, v50, v12 -; SI-NEXT: v_or_b32_e32 v13, v48, v13 -; SI-NEXT: v_or_b32_e32 v14, v30, v14 -; SI-NEXT: v_or_b32_e32 v15, v28, v15 -; SI-NEXT: v_or_b32_e32 v16, v26, v16 -; SI-NEXT: v_or_b32_e32 v17, v24, v17 -; SI-NEXT: v_or_b32_e32 v18, v22, v18 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v38 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB55_3 ; SI-NEXT: .LBB55_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -29266,11 +27433,10 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -29279,239 +27445,142 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v60 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v37 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v49 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v38 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v53 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v52 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v32 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v50 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v24 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v29 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v36 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v48 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v39 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v33 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 ; SI-NEXT: .LBB55_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB55_4: -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v39, v44 -; SI-NEXT: v_mov_b32_e32 v44, v48 -; SI-NEXT: v_mov_b32_e32 v48, v21 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v38, v45 -; SI-NEXT: v_mov_b32_e32 v45, v49 -; SI-NEXT: v_mov_b32_e32 v49, v20 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v37, v46 -; SI-NEXT: v_mov_b32_e32 v46, v50 -; SI-NEXT: v_mov_b32_e32 v50, v22 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v36, v47 -; SI-NEXT: v_mov_b32_e32 v47, v51 -; SI-NEXT: v_mov_b32_e32 v51, v23 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v35, v56 -; SI-NEXT: v_mov_b32_e32 v56, v52 -; SI-NEXT: v_mov_b32_e32 v52, v24 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v34, v57 -; SI-NEXT: v_mov_b32_e32 v57, v53 -; SI-NEXT: v_mov_b32_e32 v53, v25 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v33, v58 -; SI-NEXT: v_mov_b32_e32 v58, v54 -; SI-NEXT: v_mov_b32_e32 v54, v26 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v32, v59 -; SI-NEXT: v_mov_b32_e32 v59, v55 -; SI-NEXT: v_mov_b32_e32 v55, v27 -; SI-NEXT: v_mov_b32_e32 v40, v28 -; SI-NEXT: v_mov_b32_e32 v41, v29 -; SI-NEXT: v_mov_b32_e32 v42, v30 -; SI-NEXT: v_mov_b32_e32 v43, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v27, v55 -; SI-NEXT: v_mov_b32_e32 v55, v59 -; SI-NEXT: v_mov_b32_e32 v59, v32 -; SI-NEXT: v_mov_b32_e32 v26, v54 -; SI-NEXT: v_mov_b32_e32 v54, v58 -; SI-NEXT: v_mov_b32_e32 v58, v33 -; SI-NEXT: v_mov_b32_e32 v25, v53 -; SI-NEXT: v_mov_b32_e32 v53, v57 -; SI-NEXT: v_mov_b32_e32 v57, v34 -; SI-NEXT: v_mov_b32_e32 v24, v52 -; SI-NEXT: v_mov_b32_e32 v52, v56 -; SI-NEXT: v_mov_b32_e32 v56, v35 -; SI-NEXT: v_mov_b32_e32 v23, v51 -; SI-NEXT: v_mov_b32_e32 v51, v47 -; SI-NEXT: v_mov_b32_e32 v47, v36 -; SI-NEXT: v_mov_b32_e32 v22, v50 -; SI-NEXT: v_mov_b32_e32 v50, v46 -; SI-NEXT: v_mov_b32_e32 v46, v37 -; SI-NEXT: v_mov_b32_e32 v20, v49 -; SI-NEXT: v_mov_b32_e32 v49, v45 -; SI-NEXT: v_mov_b32_e32 v45, v38 -; SI-NEXT: v_mov_b32_e32 v21, v48 -; SI-NEXT: v_mov_b32_e32 v48, v44 -; SI-NEXT: v_mov_b32_e32 v44, v39 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v31, v43 -; SI-NEXT: v_mov_b32_e32 v30, v42 -; SI-NEXT: v_mov_b32_e32 v29, v41 -; SI-NEXT: v_mov_b32_e32 v28, v40 ; SI-NEXT: s_branch .LBB55_2 ; ; VI-LABEL: bitcast_v40f16_to_v10f64_scalar: @@ -30018,17 +28087,57 @@ end: define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v40i16_to_v40f16: ; SI: ; %bb.0: -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -30045,132 +28154,141 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v12 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v0 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v25 ; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v53, v1, v63 +; SI-NEXT: v_alignbit_b32 v1, v53, v60, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v51, v1, v40 +; SI-NEXT: v_alignbit_b32 v1, v51, v61, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v49, v1, v42 +; SI-NEXT: v_alignbit_b32 v1, v49, v62, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v38, v1, v45 +; SI-NEXT: v_alignbit_b32 v1, v38, v54, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v36, v1, v46 +; SI-NEXT: v_alignbit_b32 v1, v36, v41, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v59, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v33, v1, v57 +; SI-NEXT: v_or_b32_e32 v56, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_alignbit_b32 v1, v33, v44, 16 +; SI-NEXT: v_or_b32_e32 v43, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v55, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v30, v1, v29 +; SI-NEXT: v_or_b32_e32 v0, v0, v41 +; SI-NEXT: v_alignbit_b32 v1, v30, v47, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v0, v0, v44 +; SI-NEXT: v_or_b32_e32 v27, v1, v35 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: v_alignbit_b32 v1, v27, v58, 16 +; SI-NEXT: v_or_b32_e32 v0, v0, v47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v25, v1, v50 +; SI-NEXT: v_or_b32_e32 v0, v0, v58 +; SI-NEXT: v_alignbit_b32 v1, v25, v32, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v0, v0, v32 +; SI-NEXT: v_or_b32_e32 v20, v1, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: v_alignbit_b32 v1, v20, v39, 16 +; SI-NEXT: v_or_b32_e32 v0, v0, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 @@ -30178,281 +28296,214 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: .LBB56_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v39, v18 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v18, v52, v18 +; SI-NEXT: v_or_b32_e32 v16, v32, v16 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v16, v50, v16 +; SI-NEXT: v_or_b32_e32 v14, v58, v14 +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v14, v35, v14 +; SI-NEXT: v_or_b32_e32 v12, v47, v12 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v60 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v62 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v13 ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 -; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v12, v29, v12 +; SI-NEXT: v_or_b32_e32 v10, v44, v10 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v2, v61, v2 +; SI-NEXT: v_add_i32_e32 v59, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v4, v62, v4 +; SI-NEXT: v_add_i32_e32 v56, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v6, v54, v6 +; SI-NEXT: v_add_i32_e32 v43, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_or_b32_e32 v10, v57, v10 +; SI-NEXT: v_or_b32_e32 v8, v41, v8 +; SI-NEXT: v_add_i32_e32 v55, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v2, v40, v2 +; SI-NEXT: v_add_i32_e32 v53, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v33, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v4, v42, v4 +; SI-NEXT: v_add_i32_e32 v51, vcc, s6, v2 +; SI-NEXT: v_alignbit_b32 v0, v53, v59, 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v6, v45, v6 +; SI-NEXT: v_add_i32_e32 v49, vcc, s6, v4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_alignbit_b32 v0, v51, v56, 16 +; SI-NEXT: v_or_b32_e32 v8, v46, v8 +; SI-NEXT: v_add_i32_e32 v38, vcc, s6, v6 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v0, v49, v43, 16 +; SI-NEXT: v_add_i32_e32 v36, vcc, s6, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_alignbit_b32 v0, v38, v55, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v36, v10, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v33, v12, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v30, v14, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v27, v16, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v25, v18, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v20, v21, 16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v25 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v63 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v27 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v44, v28 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: .LBB56_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v43 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v56 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v43 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v55 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v6, v39 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v51 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v10, v55 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v42 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v45 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v37 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v47 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v49 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v57 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -30469,13 +28520,44 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v53 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v40i16_to_v40f16: @@ -30874,286 +28956,403 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i ; SI-LABEL: bitcast_v40i16_to_v40f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s43, s29, 16 -; SI-NEXT: s_lshr_b32 s42, s28, 16 -; SI-NEXT: s_lshr_b32 s41, s27, 16 -; SI-NEXT: s_lshr_b32 s40, s26, 16 -; SI-NEXT: s_lshr_b32 s15, s25, 16 -; SI-NEXT: s_lshr_b32 s14, s24, 16 -; SI-NEXT: s_lshr_b32 s13, s23, 16 -; SI-NEXT: s_lshr_b32 s12, s22, 16 -; SI-NEXT: s_lshr_b32 s11, s21, 16 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v20, s30, 0 +; SI-NEXT: v_writelane_b32 v20, s31, 1 +; SI-NEXT: v_writelane_b32 v20, s34, 2 +; SI-NEXT: v_writelane_b32 v20, s35, 3 +; SI-NEXT: v_writelane_b32 v20, s36, 4 +; SI-NEXT: v_writelane_b32 v20, s37, 5 +; SI-NEXT: v_writelane_b32 v20, s38, 6 +; SI-NEXT: v_writelane_b32 v20, s39, 7 +; SI-NEXT: v_writelane_b32 v20, s48, 8 +; SI-NEXT: v_writelane_b32 v20, s49, 9 +; SI-NEXT: v_writelane_b32 v20, s50, 10 +; SI-NEXT: v_writelane_b32 v20, s51, 11 +; SI-NEXT: v_writelane_b32 v20, s52, 12 +; SI-NEXT: v_writelane_b32 v20, s53, 13 +; SI-NEXT: v_writelane_b32 v20, s54, 14 +; SI-NEXT: v_writelane_b32 v20, s55, 15 +; SI-NEXT: v_writelane_b32 v20, s64, 16 +; SI-NEXT: v_writelane_b32 v20, s65, 17 +; SI-NEXT: v_writelane_b32 v20, s66, 18 +; SI-NEXT: v_writelane_b32 v20, s67, 19 +; SI-NEXT: v_writelane_b32 v20, s68, 20 +; SI-NEXT: v_writelane_b32 v20, s69, 21 +; SI-NEXT: v_writelane_b32 v20, s70, 22 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; SI-NEXT: s_lshr_b32 s36, s29, 16 +; SI-NEXT: s_lshr_b32 s54, s28, 16 +; SI-NEXT: s_lshr_b32 s35, s27, 16 +; SI-NEXT: s_lshr_b32 s53, s26, 16 +; SI-NEXT: s_lshr_b32 s34, s25, 16 +; SI-NEXT: s_lshr_b32 s52, s24, 16 +; SI-NEXT: s_lshr_b32 s31, s23, 16 +; SI-NEXT: s_lshr_b32 s51, s22, 16 +; SI-NEXT: s_lshr_b32 s30, s21, 16 +; SI-NEXT: s_lshr_b32 s50, s20, 16 +; SI-NEXT: s_lshr_b32 s95, s19, 16 +; SI-NEXT: s_lshr_b32 s49, s18, 16 +; SI-NEXT: s_lshr_b32 s94, s17, 16 +; SI-NEXT: s_lshr_b32 s48, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_writelane_b32 v20, s71, 23 +; SI-NEXT: v_readfirstlane_b32 s69, v5 +; SI-NEXT: v_readfirstlane_b32 s70, v4 +; SI-NEXT: v_readfirstlane_b32 s65, v3 +; SI-NEXT: v_readfirstlane_b32 s67, v2 +; SI-NEXT: v_readfirstlane_b32 s55, v1 +; SI-NEXT: v_readfirstlane_b32 s64, v0 +; SI-NEXT: v_readfirstlane_b32 s39, v7 +; SI-NEXT: v_readfirstlane_b32 s71, v8 +; SI-NEXT: v_readfirstlane_b32 s38, v9 +; SI-NEXT: v_readfirstlane_b32 s68, v10 +; SI-NEXT: v_readfirstlane_b32 s37, v11 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v0 +; SI-NEXT: v_readfirstlane_b32 s66, v12 ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v6, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v42 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s94, 16 +; SI-NEXT: s_or_b32 s47, s5, s7 +; SI-NEXT: s_and_b32 s5, s19, 0xffff +; SI-NEXT: s_lshl_b32 s7, s95, 16 +; SI-NEXT: s_or_b32 s43, s5, s7 +; SI-NEXT: s_and_b32 s5, s21, 0xffff +; SI-NEXT: s_lshl_b32 s7, s30, 16 +; SI-NEXT: s_or_b32 s41, s5, s7 +; SI-NEXT: s_and_b32 s5, s23, 0xffff +; SI-NEXT: s_lshl_b32 s7, s31, 16 +; SI-NEXT: s_or_b32 s15, s5, s7 +; SI-NEXT: s_and_b32 s5, s25, 0xffff +; SI-NEXT: s_lshl_b32 s7, s34, 16 +; SI-NEXT: s_or_b32 s13, s5, s7 +; SI-NEXT: s_and_b32 s5, s27, 0xffff +; SI-NEXT: s_lshl_b32 s7, s35, 16 +; SI-NEXT: s_or_b32 s11, s5, s7 +; SI-NEXT: s_and_b32 s5, s29, 0xffff +; SI-NEXT: s_lshl_b32 s7, s36, 16 +; SI-NEXT: s_or_b32 s9, s5, s7 +; SI-NEXT: s_and_b32 s5, s55, 0xffff +; SI-NEXT: s_lshl_b32 s7, s37, 16 +; SI-NEXT: s_or_b32 s7, s5, s7 +; SI-NEXT: s_and_b32 s5, s65, 0xffff +; SI-NEXT: s_lshl_b32 s45, s38, 16 +; SI-NEXT: s_and_b32 s44, s16, 0xffff +; SI-NEXT: s_lshl_b32 s46, s48, 16 +; SI-NEXT: s_or_b32 s5, s5, s45 +; SI-NEXT: s_and_b32 s45, s69, 0xffff +; SI-NEXT: s_lshl_b32 s56, s39, 16 +; SI-NEXT: s_lshl_b32 s42, s49, 16 +; SI-NEXT: s_or_b32 s91, s45, s56 +; SI-NEXT: s_or_b32 s44, s44, s46 +; SI-NEXT: s_lshr_b64 s[56:57], s[46:47], 16 +; SI-NEXT: s_and_b32 s46, s18, 0xffff +; SI-NEXT: s_lshl_b32 s40, s50, 16 +; SI-NEXT: s_or_b32 s46, s46, s42 +; SI-NEXT: s_lshr_b64 s[58:59], s[42:43], 16 +; SI-NEXT: s_and_b32 s42, s20, 0xffff +; SI-NEXT: s_lshl_b32 s14, s51, 16 +; SI-NEXT: s_or_b32 s42, s42, s40 +; SI-NEXT: s_lshr_b64 s[60:61], s[40:41], 16 +; SI-NEXT: s_and_b32 s40, s22, 0xffff +; SI-NEXT: s_lshl_b32 s12, s52, 16 +; SI-NEXT: s_or_b32 s40, s40, s14 +; SI-NEXT: s_lshr_b64 s[62:63], s[14:15], 16 +; SI-NEXT: s_and_b32 s14, s24, 0xffff +; SI-NEXT: s_lshl_b32 s10, s53, 16 +; SI-NEXT: s_or_b32 s14, s14, s12 +; SI-NEXT: s_lshr_b64 s[72:73], s[12:13], 16 +; SI-NEXT: s_and_b32 s12, s26, 0xffff +; SI-NEXT: s_lshl_b32 s8, s54, 16 +; SI-NEXT: s_or_b32 s12, s12, s10 +; SI-NEXT: s_lshr_b64 s[74:75], s[10:11], 16 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s6, s66, 16 +; SI-NEXT: s_or_b32 s10, s10, s8 +; SI-NEXT: s_lshr_b64 s[76:77], s[8:9], 16 +; SI-NEXT: s_and_b32 s8, s64, 0xffff +; SI-NEXT: s_lshl_b32 s4, s68, 16 +; SI-NEXT: s_or_b32 s8, s8, s6 +; SI-NEXT: s_lshr_b64 s[78:79], s[6:7], 16 +; SI-NEXT: s_and_b32 s6, s67, 0xffff +; SI-NEXT: s_lshl_b32 s90, s71, 16 +; SI-NEXT: s_or_b32 s6, s6, s4 +; SI-NEXT: s_lshr_b64 s[88:89], s[4:5], 16 +; SI-NEXT: s_and_b32 s4, s70, 0xffff +; SI-NEXT: s_mov_b32 s45, s47 +; SI-NEXT: s_mov_b32 s47, s43 +; SI-NEXT: s_mov_b32 s43, s41 +; SI-NEXT: s_mov_b32 s41, s15 +; SI-NEXT: s_mov_b32 s15, s13 +; SI-NEXT: s_mov_b32 s13, s11 +; SI-NEXT: s_mov_b32 s11, s9 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s7, s5 +; SI-NEXT: s_or_b32 s4, s4, s90 +; SI-NEXT: s_mov_b32 s5, s91 +; SI-NEXT: s_lshr_b64 s[90:91], s[90:91], 16 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v42 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v41 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v40 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v43 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: s_add_i32 s43, s43, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s70, s70, 3 +; SI-NEXT: s_and_b32 s4, s70, 0xffff +; SI-NEXT: s_lshl_b32 s5, s71, 16 +; SI-NEXT: s_add_i32 s69, s69, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s69, 0xffff +; SI-NEXT: s_lshl_b32 s6, s39, 16 +; SI-NEXT: s_add_i32 s67, s67, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s67, 0xffff +; SI-NEXT: s_lshl_b32 s7, s68, 16 +; SI-NEXT: s_add_i32 s65, s65, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s65, 0xffff +; SI-NEXT: s_lshl_b32 s8, s38, 16 +; SI-NEXT: s_add_i32 s64, s64, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s64, 0xffff +; SI-NEXT: s_lshl_b32 s9, s66, 16 +; SI-NEXT: s_add_i32 s55, s55, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s55, 0xffff +; SI-NEXT: s_lshl_b32 s10, s37, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s54, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s29, 0xffff +; SI-NEXT: s_lshl_b32 s12, s36, 16 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s12, s26, 0xffff +; SI-NEXT: s_lshl_b32 s13, s53, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s27, 0xffff +; SI-NEXT: s_lshl_b32 s14, s35, 16 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_and_b32 s14, s24, 0xffff +; SI-NEXT: s_lshl_b32 s15, s52, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: s_and_b32 s15, s25, 0xffff +; SI-NEXT: s_lshl_b32 s24, s34, 16 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_or_b32 s15, s24, s15 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_lshl_b32 s24, s51, 16 +; SI-NEXT: s_or_b32 s22, s24, s22 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s40, s22, 0x30000 +; SI-NEXT: s_and_b32 s22, s23, 0xffff +; SI-NEXT: s_lshl_b32 s23, s31, 16 +; SI-NEXT: s_or_b32 s22, s23, s22 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s41, s22, 0x30000 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_lshl_b32 s22, s50, 16 +; SI-NEXT: s_or_b32 s20, s22, s20 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s42, s20, 0x30000 +; SI-NEXT: s_and_b32 s20, s21, 0xffff +; SI-NEXT: s_lshl_b32 s21, s30, 16 +; SI-NEXT: s_or_b32 s20, s21, s20 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_add_i32 s43, s20, 0x30000 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s20, s49, 16 +; SI-NEXT: s_or_b32 s18, s20, s18 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s46, s18, 0x30000 +; SI-NEXT: s_and_b32 s18, s19, 0xffff +; SI-NEXT: s_lshl_b32 s19, s95, 16 +; SI-NEXT: s_or_b32 s18, s19, s18 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: s_add_i32 s47, s18, 0x30000 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s18, s48, 16 +; SI-NEXT: s_or_b32 s16, s18, s16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s44, s16, 0x30000 +; SI-NEXT: s_and_b32 s16, s17, 0xffff +; SI-NEXT: s_lshl_b32 s17, s94, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s45, s16, 0x30000 +; SI-NEXT: s_lshr_b64 s[56:57], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[4:5], 16 +; SI-NEXT: s_lshr_b32 s94, s45, 16 +; SI-NEXT: s_lshr_b32 s95, s47, 16 +; SI-NEXT: s_lshr_b32 s30, s43, 16 +; SI-NEXT: s_lshr_b32 s31, s41, 16 +; SI-NEXT: s_lshr_b32 s34, s15, 16 +; SI-NEXT: s_lshr_b32 s35, s13, 16 +; SI-NEXT: s_lshr_b32 s36, s11, 16 +; SI-NEXT: s_lshr_b32 s37, s9, 16 +; SI-NEXT: s_lshr_b32 s38, s7, 16 +; SI-NEXT: s_lshr_b32 s39, s5, 16 ; SI-NEXT: .LBB57_3: ; %end -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v8 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v17 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v21 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v12 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v25 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v29 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v18 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v33 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v22 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v37 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v26 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v39 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v30 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v49 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v34 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v51 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v35 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: s_and_b32 s16, s44, 0xffff +; SI-NEXT: s_lshl_b32 s17, s56, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s45, 0xffff +; SI-NEXT: s_lshl_b32 s18, s94, 16 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_and_b32 s18, s46, 0xffff +; SI-NEXT: s_lshl_b32 s19, s58, 16 +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: s_and_b32 s19, s47, 0xffff +; SI-NEXT: s_lshl_b32 s20, s95, 16 +; SI-NEXT: s_or_b32 s19, s19, s20 +; SI-NEXT: s_and_b32 s20, s42, 0xffff +; SI-NEXT: s_lshl_b32 s21, s60, 16 +; SI-NEXT: s_or_b32 s20, s20, s21 +; SI-NEXT: s_and_b32 s21, s43, 0xffff +; SI-NEXT: s_lshl_b32 s22, s30, 16 +; SI-NEXT: s_or_b32 s21, s21, s22 +; SI-NEXT: s_and_b32 s22, s40, 0xffff +; SI-NEXT: s_lshl_b32 s23, s62, 16 +; SI-NEXT: s_or_b32 s22, s22, s23 +; SI-NEXT: s_and_b32 s23, s41, 0xffff +; SI-NEXT: s_lshl_b32 s24, s31, 16 +; SI-NEXT: s_or_b32 s23, s23, s24 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s24, s72, 16 +; SI-NEXT: s_or_b32 s14, s14, s24 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s24, s34, 16 +; SI-NEXT: s_or_b32 s15, s15, s24 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s24, s74, 16 +; SI-NEXT: s_or_b32 s12, s12, s24 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s24, s35, 16 +; SI-NEXT: s_or_b32 s13, s13, s24 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s24, s76, 16 +; SI-NEXT: s_or_b32 s10, s10, s24 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s24, s36, 16 +; SI-NEXT: s_or_b32 s11, s11, s24 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s24, s78, 16 +; SI-NEXT: s_or_b32 s8, s8, s24 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s24, s37, 16 +; SI-NEXT: s_or_b32 s9, s9, s24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s24, s88, 16 +; SI-NEXT: s_or_b32 s6, s6, s24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s24, s38, 16 +; SI-NEXT: s_or_b32 s7, s7, s24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s24, s90, 16 +; SI-NEXT: s_or_b32 s4, s4, s24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s24, s39, 16 +; SI-NEXT: s_or_b32 s5, s5, s24 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s14 +; SI-NEXT: v_mov_b32_e32 v9, s15 +; SI-NEXT: v_mov_b32_e32 v10, s12 +; SI-NEXT: v_mov_b32_e32 v11, s13 +; SI-NEXT: v_mov_b32_e32 v12, s10 +; SI-NEXT: v_mov_b32_e32 v13, s11 +; SI-NEXT: v_mov_b32_e32 v14, s8 +; SI-NEXT: v_mov_b32_e32 v15, s9 +; SI-NEXT: v_mov_b32_e32 v16, s6 +; SI-NEXT: v_mov_b32_e32 v17, s7 +; SI-NEXT: v_mov_b32_e32 v18, s4 +; SI-NEXT: v_mov_b32_e32 v19, s5 +; SI-NEXT: v_readlane_b32 s71, v20, 23 +; SI-NEXT: v_readlane_b32 s70, v20, 22 +; SI-NEXT: v_readlane_b32 s69, v20, 21 +; SI-NEXT: v_readlane_b32 s68, v20, 20 +; SI-NEXT: v_readlane_b32 s67, v20, 19 +; SI-NEXT: v_readlane_b32 s66, v20, 18 +; SI-NEXT: v_readlane_b32 s65, v20, 17 +; SI-NEXT: v_readlane_b32 s64, v20, 16 +; SI-NEXT: v_readlane_b32 s55, v20, 15 +; SI-NEXT: v_readlane_b32 s54, v20, 14 +; SI-NEXT: v_readlane_b32 s53, v20, 13 +; SI-NEXT: v_readlane_b32 s52, v20, 12 +; SI-NEXT: v_readlane_b32 s51, v20, 11 +; SI-NEXT: v_readlane_b32 s50, v20, 10 +; SI-NEXT: v_readlane_b32 s49, v20, 9 +; SI-NEXT: v_readlane_b32 s48, v20, 8 +; SI-NEXT: v_readlane_b32 s39, v20, 7 +; SI-NEXT: v_readlane_b32 s38, v20, 6 +; SI-NEXT: v_readlane_b32 s37, v20, 5 +; SI-NEXT: v_readlane_b32 s36, v20, 4 +; SI-NEXT: v_readlane_b32 s35, v20, 3 +; SI-NEXT: v_readlane_b32 s34, v20, 2 +; SI-NEXT: v_readlane_b32 s31, v20, 1 +; SI-NEXT: v_readlane_b32 s30, v20, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr90 ; SI-NEXT: s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v40i16_to_v40f16_scalar: @@ -31803,344 +30002,264 @@ define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v40f16_to_v40i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v23 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v49 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v38 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v35 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_or_b32_e32 v19, v19, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_or_b32_e32 v22, v22, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v17, v17, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v18, v18, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v15, v15, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_or_b32_e32 v23, v23, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_or_b32_e32 v13, v13, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_or_b32_e32 v25, v25, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_or_b32_e32 v11, v11, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_or_b32_e32 v12, v12, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_or_b32_e32 v9, v9, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_or_b32_e32 v26, v26, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_or_b32_e32 v7, v7, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v28, v28, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 -; SI-NEXT: v_or_b32_e32 v6, v6, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; SI-NEXT: v_or_b32_e32 v30, v30, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v5, v5, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v20 +; SI-NEXT: v_or_b32_e32 v3, v3, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v48 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v38 -; SI-NEXT: v_or_b32_e32 v2, v2, v32 -; SI-NEXT: v_or_b32_e32 v0, v0, v20 -; SI-NEXT: v_or_b32_e32 v29, v29, v39 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v39 +; SI-NEXT: v_or_b32_e32 v2, v2, v38 ; SI-NEXT: v_or_b32_e32 v4, v4, v37 -; SI-NEXT: v_or_b32_e32 v8, v8, v36 -; SI-NEXT: v_or_b32_e32 v27, v27, v49 -; SI-NEXT: v_or_b32_e32 v10, v10, v34 -; SI-NEXT: v_or_b32_e32 v14, v14, v33 -; SI-NEXT: v_or_b32_e32 v24, v24, v50 -; SI-NEXT: v_or_b32_e32 v16, v16, v31 -; SI-NEXT: v_or_b32_e32 v21, v21, v51 -; SI-NEXT: v_alignbit_b32 v48, v2, v20, 16 -; SI-NEXT: v_alignbit_b32 v39, v30, v39, 16 -; SI-NEXT: v_alignbit_b32 v38, v6, v37, 16 -; SI-NEXT: v_alignbit_b32 v37, v28, v36, 16 -; SI-NEXT: v_alignbit_b32 v36, v26, v49, 16 -; SI-NEXT: v_alignbit_b32 v35, v12, v34, 16 -; SI-NEXT: v_alignbit_b32 v34, v25, v33, 16 -; SI-NEXT: v_alignbit_b32 v33, v23, v50, 16 -; SI-NEXT: v_alignbit_b32 v32, v18, v31, 16 -; SI-NEXT: v_alignbit_b32 v31, v22, v51, 16 +; SI-NEXT: v_or_b32_e32 v6, v6, v36 +; SI-NEXT: v_or_b32_e32 v8, v8, v35 +; SI-NEXT: v_or_b32_e32 v10, v10, v33 +; SI-NEXT: v_or_b32_e32 v12, v12, v32 +; SI-NEXT: v_or_b32_e32 v14, v14, v29 +; SI-NEXT: v_or_b32_e32 v16, v16, v26 +; SI-NEXT: v_or_b32_e32 v18, v18, v24 +; SI-NEXT: v_alignbit_b32 v39, v1, v39, 16 +; SI-NEXT: v_alignbit_b32 v38, v3, v38, 16 +; SI-NEXT: v_alignbit_b32 v37, v5, v37, 16 +; SI-NEXT: v_alignbit_b32 v36, v7, v36, 16 +; SI-NEXT: v_alignbit_b32 v35, v9, v35, 16 +; SI-NEXT: v_alignbit_b32 v33, v11, v33, 16 +; SI-NEXT: v_alignbit_b32 v32, v13, v32, 16 +; SI-NEXT: v_alignbit_b32 v29, v15, v29, 16 +; SI-NEXT: v_alignbit_b32 v26, v17, v26, 16 +; SI-NEXT: v_alignbit_b32 v24, v19, v24, 16 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v48 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v20 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v39 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v2, v2, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v30 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v37 -; SI-NEXT: v_or_b32_e32 v3, v20, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v3, v3, v20 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v38 -; SI-NEXT: v_or_b32_e32 v6, v6, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v37 ; SI-NEXT: v_or_b32_e32 v4, v4, v20 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v31 +; SI-NEXT: v_or_b32_e32 v5, v5, v20 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v36 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v6, v6, v20 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v30 +; SI-NEXT: v_or_b32_e32 v7, v7, v20 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v35 ; SI-NEXT: v_or_b32_e32 v8, v8, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 -; SI-NEXT: v_or_b32_e32 v9, v20, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; SI-NEXT: v_or_b32_e32 v9, v9, v20 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v35 -; SI-NEXT: v_or_b32_e32 v12, v12, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v10, v10, v20 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v24 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v33 +; SI-NEXT: v_or_b32_e32 v10, v10, v20 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 +; SI-NEXT: v_or_b32_e32 v11, v11, v20 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v32 +; SI-NEXT: v_or_b32_e32 v12, v12, v20 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 +; SI-NEXT: v_or_b32_e32 v13, v13, v20 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v29 ; SI-NEXT: v_or_b32_e32 v14, v14, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v20, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v23 +; SI-NEXT: v_or_b32_e32 v15, v15, v20 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v32 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v26 ; SI-NEXT: v_or_b32_e32 v16, v16, v20 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v31 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_or_b32_e32 v17, v17, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v38 ; SI-NEXT: v_or_b32_e32 v18, v18, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: v_or_b32_e32 v0, v0, v39 +; SI-NEXT: v_or_b32_e32 v2, v2, v34 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v40f16_to_v40i16: @@ -32540,374 +30659,326 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i ; SI-LABEL: bitcast_v40f16_to_v40i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s16 -; SI-NEXT: s_lshr_b32 s10, s25, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s22 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: v_mov_b32_e32 v20, v4 +; SI-NEXT: v_mov_b32_e32 v19, v2 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: s_lshr_b32 s7, s29, 16 +; SI-NEXT: s_lshr_b32 s6, s28, 16 ; SI-NEXT: s_lshr_b32 s8, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v3 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v56, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 -; SI-NEXT: s_lshr_b32 s7, s28, 16 -; SI-NEXT: s_lshr_b32 s9, s26, 16 -; SI-NEXT: s_lshr_b32 s11, s24, 16 -; SI-NEXT: s_lshr_b32 s12, s22, 16 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: s_lshr_b32 s14, s18, 16 -; SI-NEXT: s_lshr_b32 s15, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v13 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v6 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v59, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v18 +; SI-NEXT: s_lshr_b32 s12, s26, 16 +; SI-NEXT: s_lshr_b32 s40, s25, 16 +; SI-NEXT: s_lshr_b32 s13, s24, 16 +; SI-NEXT: s_lshr_b32 s15, s23, 16 +; SI-NEXT: s_lshr_b32 s14, s22, 16 +; SI-NEXT: s_lshr_b32 s11, s21, 16 +; SI-NEXT: s_lshr_b32 s41, s20, 16 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s42, s18, 16 +; SI-NEXT: s_lshr_b32 s9, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB59_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: s_cbranch_execnz .LBB59_4 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s13 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v44 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 -; SI-NEXT: v_or_b32_e32 v53, v16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v54 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s19 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v43 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s9 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s23 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s21 +; SI-NEXT: v_or_b32_e32 v13, v4, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s10 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v52 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v53 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_or_b32_e32 v9, v11, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_or_b32_e32 v54, v16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v58 -; SI-NEXT: v_or_b32_e32 v44, v21, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v46 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v30 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_or_b32_e32 v43, v18, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v41 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 +; SI-NEXT: v_or_b32_e32 v11, v4, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s12 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v50 +; SI-NEXT: v_or_b32_e32 v18, v17, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v55 +; SI-NEXT: v_or_b32_e32 v26, v2, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s7 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v48 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v4 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 +; SI-NEXT: v_or_b32_e32 v28, v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v4 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s16 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s18 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v49 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v24 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_or_b32_e32 v42, v16, v8 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v39 +; SI-NEXT: v_or_b32_e32 v5, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s22 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_or_b32_e32 v46, v21, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s24 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v47, v15, v6 +; SI-NEXT: v_or_b32_e32 v45, v16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v20 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_or_b32_e32 v58, v16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v22 -; SI-NEXT: v_or_b32_e32 v46, v21, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v55 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v24 -; SI-NEXT: v_or_b32_e32 v41, v18, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v59 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v55, v21, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v48 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v59, v18, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v42 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v21 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v42, v20, v18 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v48 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v57 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v50 -; SI-NEXT: v_or_b32_e32 v17, v17, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v20 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v21 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v57 -; SI-NEXT: v_or_b32_e32 v15, v15, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v52 -; SI-NEXT: v_or_b32_e32 v13, v13, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v56 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v45 -; SI-NEXT: v_or_b32_e32 v11, v11, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v20 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v20 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v56 -; SI-NEXT: v_or_b32_e32 v9, v9, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v49 -; SI-NEXT: v_or_b32_e32 v7, v7, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v21 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v20 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v51 -; SI-NEXT: v_or_b32_e32 v3, v3, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v47 -; SI-NEXT: v_or_b32_e32 v5, v5, v21 -; SI-NEXT: v_or_b32_e32 v1, v1, v20 -; SI-NEXT: v_lshr_b64 v[38:39], v[0:1], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[2:3], 16 -; SI-NEXT: v_lshr_b64 v[34:35], v[4:5], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[6:7], 16 -; SI-NEXT: v_lshr_b64 v[30:31], v[8:9], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[10:11], 16 -; SI-NEXT: v_lshr_b64 v[26:27], v[12:13], 16 -; SI-NEXT: v_lshr_b64 v[24:25], v[14:15], 16 -; SI-NEXT: v_lshr_b64 v[22:23], v[16:17], 16 -; SI-NEXT: v_lshr_b64 v[20:21], v[18:19], 16 -; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v44, v21, v17 +; SI-NEXT: v_or_b32_e32 v41, v22, v25 +; SI-NEXT: v_or_b32_e32 v43, v15, v27 +; SI-NEXT: v_or_b32_e32 v19, v16, v2 +; SI-NEXT: v_lshr_b64 v[21:22], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[15:16], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[33:34], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[31:32], v[25:26], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[27:28], 16 +; SI-NEXT: v_lshr_b64 v[16:17], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[4:5], 16 +; SI-NEXT: v_or_b32_e32 v14, v14, v0 +; SI-NEXT: v_or_b32_e32 v20, v20, v4 +; SI-NEXT: s_branch .LBB59_5 +; SI-NEXT: .LBB59_3: +; SI-NEXT: s_branch .LBB59_2 +; SI-NEXT: .LBB59_4: +; SI-NEXT: v_mov_b32_e32 v40, s7 +; SI-NEXT: v_mov_b32_e32 v55, s8 +; SI-NEXT: v_mov_b32_e32 v50, s40 +; SI-NEXT: v_mov_b32_e32 v51, s15 +; SI-NEXT: v_mov_b32_e32 v52, s11 +; SI-NEXT: v_mov_b32_e32 v53, s10 +; SI-NEXT: v_mov_b32_e32 v54, s9 +; SI-NEXT: v_mov_b32_e32 v11, s17 +; SI-NEXT: v_mov_b32_e32 v9, s19 +; SI-NEXT: v_mov_b32_e32 v7, s21 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v43, s28 +; SI-NEXT: v_mov_b32_e32 v41, s26 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v44, s24 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v45, s22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v47, s20 +; SI-NEXT: v_mov_b32_e32 v42, s18 +; SI-NEXT: v_mov_b32_e32 v46, s16 +; SI-NEXT: v_mov_b32_e32 v13, s23 +; SI-NEXT: v_mov_b32_e32 v18, s25 +; SI-NEXT: v_mov_b32_e32 v26, s27 +; SI-NEXT: v_mov_b32_e32 v28, s29 +; SI-NEXT: v_mov_b32_e32 v21, s43 +; SI-NEXT: v_mov_b32_e32 v37, s42 +; SI-NEXT: v_mov_b32_e32 v22, s41 +; SI-NEXT: v_mov_b32_e32 v35, s14 +; SI-NEXT: v_mov_b32_e32 v33, s13 +; SI-NEXT: v_mov_b32_e32 v31, s12 +; SI-NEXT: v_mov_b32_e32 v29, s6 +; SI-NEXT: .LBB59_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v46 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v43 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v54 +; SI-NEXT: v_or_b32_e32 v21, v2, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v42 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v40 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v34 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v53 +; SI-NEXT: v_or_b32_e32 v23, v4, v6 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v22 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v47 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v32 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v44 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v49 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v52 +; SI-NEXT: v_or_b32_e32 v22, v6, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v45 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v51 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v41 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v30 -; SI-NEXT: v_or_b32_e32 v8, v8, v10 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v33 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v50 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v58 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v28 -; SI-NEXT: v_or_b32_e32 v10, v10, v12 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v45 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v31 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v55 ; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v46 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v26 -; SI-NEXT: v_or_b32_e32 v12, v12, v14 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v52 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v59 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v24 -; SI-NEXT: v_or_b32_e32 v14, v14, v16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v57 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v55 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 -; SI-NEXT: v_or_b32_e32 v16, v16, v18 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v50 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v42 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v18, v18, v20 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v48 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v40 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v29 +; SI-NEXT: v_or_b32_e32 v15, v1, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v16, v1, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; SI-NEXT: v_or_b32_e32 v13, v13, v17 +; SI-NEXT: v_or_b32_e32 v17, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24 +; SI-NEXT: v_or_b32_e32 v18, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v39 +; SI-NEXT: v_or_b32_e32 v19, v1, v3 +; SI-NEXT: v_mov_b32_e32 v1, v21 +; SI-NEXT: v_mov_b32_e32 v3, v23 +; SI-NEXT: v_mov_b32_e32 v5, v22 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB59_4: -; SI-NEXT: s_branch .LBB59_2 ; ; VI-LABEL: bitcast_v40f16_to_v40i16_scalar: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll index 8a0d00ea6164f..b074de310729d 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll @@ -1387,44 +1387,28 @@ define <4 x half> @bitcast_i64_to_v4f16(i64 %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: .LBB16_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_alignbit_b32 v2, v1, v0, 16 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: .LBB16_4: ; %end +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_i64_to_v4f16: @@ -1494,37 +1478,27 @@ define inreg <4 x half> @bitcast_i64_to_v4f16_scalar(i64 inreg %a, i32 inreg %b) ; SI-NEXT: s_cmp_lg_u32 s18, 0 ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s16 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s16, 3 -; SI-NEXT: s_addc_u32 s5, s17, 0 -; SI-NEXT: s_lshr_b32 s6, s4, 16 -; SI-NEXT: s_lshr_b32 s7, s5, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 16 ; SI-NEXT: .LBB17_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: v_or_b32_e32 v1, v4, v1 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s8, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_i64_to_v4f16_scalar: @@ -1600,16 +1574,10 @@ define i64 @bitcast_v4f16_to_i64(<4 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v4f16_to_i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_mov_b32_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -1622,21 +1590,23 @@ define i64 @bitcast_v4f16_to_i64(<4 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB18_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_or_b32_e32 v0, v5, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: .LBB18_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1721,46 +1691,43 @@ define inreg i64 @bitcast_v4f16_to_i64_scalar(<4 x half> inreg %a, i32 inreg %b) ; SI-LABEL: bitcast_v4f16_to_i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: s_lshr_b32 s5, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v6 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_lshr_b32 s9, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: s_cbranch_scc0 .LBB19_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s8, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_cbranch_execnz .LBB19_4 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB19_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: .LBB19_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5 ; SI-NEXT: s_branch .LBB19_2 +; SI-NEXT: .LBB19_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f16_to_i64_scalar: ; VI: ; %bb.0: @@ -4644,42 +4611,27 @@ define <4 x half> @bitcast_f64_to_v4f16(double %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB40_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 -; SI-NEXT: .LBB40_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB40_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_alignbit_b32 v2, v1, v0, 16 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: .LBB40_4: ; %end +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f64_to_v4f16: @@ -4744,39 +4696,33 @@ define inreg <4 x half> @bitcast_f64_to_v4f16_scalar(double inreg %a, i32 inreg ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: s_cbranch_scc0 .LBB41_4 +; SI-NEXT: s_cbranch_scc0 .LBB41_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s16 -; SI-NEXT: s_cbranch_execnz .LBB41_3 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 16 +; SI-NEXT: s_cbranch_execnz .LBB41_4 ; SI-NEXT: .LBB41_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[1:2], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_lshr_b64 v[2:3], v[0:1], 16 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: .LBB41_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: v_or_b32_e32 v1, v4, v1 -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB41_4: -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB41_5 +; SI-NEXT: .LBB41_3: +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: s_branch .LBB41_2 +; SI-NEXT: .LBB41_4: +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v3, s8 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: .LBB41_5: ; %end +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f64_to_v4f16_scalar: ; VI: ; %bb.0: @@ -4850,16 +4796,10 @@ define double @bitcast_v4f16_to_f64(<4 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v4f16_to_f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_mov_b32_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -4872,21 +4812,23 @@ define double @bitcast_v4f16_to_f64(<4 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB42_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_or_b32_e32 v0, v5, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_2 ; SI-NEXT: .LBB42_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -4971,46 +4913,43 @@ define inreg double @bitcast_v4f16_to_f64_scalar(<4 x half> inreg %a, i32 inreg ; SI-LABEL: bitcast_v4f16_to_f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: s_lshr_b32 s5, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v6 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_lshr_b32 s9, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: s_cbranch_scc0 .LBB43_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s8, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_cbranch_execnz .LBB43_4 ; SI-NEXT: .LBB43_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB43_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: .LBB43_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5 ; SI-NEXT: s_branch .LBB43_2 +; SI-NEXT: .LBB43_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f16_to_f64_scalar: ; VI: ; %bb.0: @@ -7587,44 +7526,28 @@ define <4 x half> @bitcast_v2i32_to_v4f16(<2 x i32> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB60_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: .LBB60_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB60_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_alignbit_b32 v2, v1, v0, 16 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: .LBB60_4: ; %end +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i32_to_v4f16: @@ -7693,37 +7616,27 @@ define inreg <4 x half> @bitcast_v2i32_to_v4f16_scalar(<2 x i32> inreg %a, i32 i ; SI-NEXT: s_cmp_lg_u32 s18, 0 ; SI-NEXT: s_cbranch_scc0 .LBB61_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s16 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB61_3 ; SI-NEXT: .LBB61_2: ; %cmp.true -; SI-NEXT: s_add_i32 s4, s16, 3 -; SI-NEXT: s_add_i32 s6, s17, 3 -; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: s_lshr_b32 s7, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s8, s17, 16 ; SI-NEXT: .LBB61_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: v_or_b32_e32 v1, v4, v1 -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB61_4: -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s8, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB61_4: +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: s_branch .LBB61_2 ; ; VI-LABEL: bitcast_v2i32_to_v4f16_scalar: @@ -7799,16 +7712,10 @@ define <2 x i32> @bitcast_v4f16_to_v2i32(<4 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v4f16_to_v2i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_mov_b32_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -7821,21 +7728,23 @@ define <2 x i32> @bitcast_v4f16_to_v2i32(<4 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB62_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_or_b32_e32 v0, v5, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB62_2 ; SI-NEXT: .LBB62_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -7920,46 +7829,43 @@ define inreg <2 x i32> @bitcast_v4f16_to_v2i32_scalar(<4 x half> inreg %a, i32 i ; SI-LABEL: bitcast_v4f16_to_v2i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: s_lshr_b32 s5, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v6 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_lshr_b32 s9, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: s_cbranch_scc0 .LBB63_4 +; SI-NEXT: s_cbranch_scc0 .LBB63_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: s_cbranch_execnz .LBB63_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s8, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_cbranch_execnz .LBB63_4 ; SI-NEXT: .LBB63_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: .LBB63_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB63_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: .LBB63_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5 ; SI-NEXT: s_branch .LBB63_2 +; SI-NEXT: .LBB63_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f16_to_v2i32_scalar: ; VI: ; %bb.0: @@ -10208,44 +10114,28 @@ define <4 x half> @bitcast_v2f32_to_v4f16(<2 x float> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB76_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: .LBB76_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB76_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_alignbit_b32 v2, v1, v0, 16 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: .LBB76_4: ; %end +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f32_to_v4f16: @@ -10311,40 +10201,34 @@ define inreg <4 x half> @bitcast_v2f32_to_v4f16_scalar(<2 x float> inreg %a, i32 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: s_cbranch_scc0 .LBB77_4 +; SI-NEXT: s_cbranch_scc0 .LBB77_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s16 -; SI-NEXT: s_cbranch_execnz .LBB77_3 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 16 +; SI-NEXT: s_cbranch_execnz .LBB77_4 ; SI-NEXT: .LBB77_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v0, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_lshr_b64 v[2:3], v[0:1], 16 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: .LBB77_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: v_or_b32_e32 v1, v4, v1 -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB77_4: -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB77_5 +; SI-NEXT: .LBB77_3: +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: s_branch .LBB77_2 +; SI-NEXT: .LBB77_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s8 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: .LBB77_5: ; %end +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f32_to_v4f16_scalar: ; VI: ; %bb.0: @@ -10421,16 +10305,10 @@ define <2 x float> @bitcast_v4f16_to_v2f32(<4 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v4f16_to_v2f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_mov_b32_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -10443,21 +10321,23 @@ define <2 x float> @bitcast_v4f16_to_v2f32(<4 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB78_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_or_b32_e32 v0, v5, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB78_2 ; SI-NEXT: .LBB78_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -10542,46 +10422,43 @@ define inreg <2 x float> @bitcast_v4f16_to_v2f32_scalar(<4 x half> inreg %a, i32 ; SI-LABEL: bitcast_v4f16_to_v2f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: s_lshr_b32 s5, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v6 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_lshr_b32 s9, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: s_cbranch_scc0 .LBB79_4 +; SI-NEXT: s_cbranch_scc0 .LBB79_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: s_cbranch_execnz .LBB79_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s8, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_cbranch_execnz .LBB79_4 ; SI-NEXT: .LBB79_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: .LBB79_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB79_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: .LBB79_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5 ; SI-NEXT: s_branch .LBB79_2 +; SI-NEXT: .LBB79_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f16_to_v2f32_scalar: ; VI: ; %bb.0: @@ -12439,47 +12316,48 @@ define <4 x half> @bitcast_v4i16_to_v4f16(<4 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v4i16_to_v4f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v4 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB88_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v6 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v4, v1, v7 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_alignbit_b32 v5, v4, v6, 16 +; SI-NEXT: v_or_b32_e32 v2, v0, v6 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: .LBB88_2: ; %Flow +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB88_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v7 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v6, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v7, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v0 +; SI-NEXT: v_alignbit_b32 v5, v4, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 ; SI-NEXT: .LBB88_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i16_to_v4f16: @@ -12550,40 +12428,46 @@ define inreg <4 x half> @bitcast_v4i16_to_v4f16_scalar(<4 x i16> inreg %a, i32 i ; SI-LABEL: bitcast_v4i16_to_v4f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: s_lshr_b32 s11, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s18, 0 ; SI-NEXT: s_cbranch_scc0 .LBB89_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s10, 16 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s12, s11, 16 +; SI-NEXT: s_or_b32 s13, s5, s6 +; SI-NEXT: s_or_b32 s4, s4, s12 +; SI-NEXT: s_lshr_b64 s[6:7], s[12:13], 16 +; SI-NEXT: s_mov_b32 s5, s13 ; SI-NEXT: s_cbranch_execnz .LBB89_3 ; SI-NEXT: .LBB89_2: ; %cmp.true -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s10, 16 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16 +; SI-NEXT: s_lshr_b32 s10, s5, 16 ; SI-NEXT: .LBB89_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s6, s10, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB89_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: s_branch .LBB89_2 ; ; VI-LABEL: bitcast_v4i16_to_v4f16_scalar: @@ -12670,14 +12554,6 @@ define <4 x i16> @bitcast_v4f16_to_v4i16(<4 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -12780,39 +12656,39 @@ define inreg <4 x i16> @bitcast_v4f16_to_v4i16_scalar(<4 x half> inreg %a, i32 i ; SI-LABEL: bitcast_v4f16_to_v4i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: s_lshr_b32 s5, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s7, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: s_cbranch_scc0 .LBB91_4 +; SI-NEXT: s_cbranch_scc0 .LBB91_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB91_3 +; SI-NEXT: s_cbranch_execnz .LBB91_4 ; SI-NEXT: .LBB91_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_lshr_b64 v[2:3], v[0:1], 16 ; SI-NEXT: v_or_b32_e32 v0, v5, v0 -; SI-NEXT: .LBB91_3: ; %end +; SI-NEXT: s_branch .LBB91_5 +; SI-NEXT: .LBB91_3: +; SI-NEXT: s_branch .LBB91_2 +; SI-NEXT: .LBB91_4: +; SI-NEXT: v_mov_b32_e32 v4, s6 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: .LBB91_5: ; %end ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 @@ -12820,8 +12696,6 @@ define inreg <4 x i16> @bitcast_v4f16_to_v4i16_scalar(<4 x half> inreg %a, i32 i ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB91_4: -; SI-NEXT: s_branch .LBB91_2 ; ; VI-LABEL: bitcast_v4f16_to_v4i16_scalar: ; VI: ; %bb.0: @@ -14769,62 +14643,54 @@ define <4 x bfloat> @bitcast_v4f16_to_v4bf16(<4 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v4f16_to_v4bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB100_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: .LBB100_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f16_to_v4bf16: @@ -14896,57 +14762,55 @@ define inreg <4 x bfloat> @bitcast_v4f16_to_v4bf16_scalar(<4 x half> inreg %a, i ; SI-LABEL: bitcast_v4f16_to_v4bf16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: s_lshr_b32 s5, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v4 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: s_cbranch_scc0 .LBB101_4 +; SI-NEXT: s_cbranch_scc0 .LBB101_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; SI-NEXT: s_cbranch_execnz .LBB101_3 +; SI-NEXT: s_lshl_b32 s8, s16, 16 +; SI-NEXT: s_lshl_b32 s9, s6, 16 +; SI-NEXT: s_lshl_b32 s10, s17, 16 +; SI-NEXT: s_lshl_b32 s11, s7, 16 +; SI-NEXT: s_cbranch_execnz .LBB101_4 ; SI-NEXT: .LBB101_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s16 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: .LBB101_3: ; %end -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v1 -; SI-NEXT: v_lshr_b64 v[0:1], v[5:6], 16 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: s_branch .LBB101_5 +; SI-NEXT: .LBB101_3: +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: s_branch .LBB101_2 +; SI-NEXT: .LBB101_4: +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: .LBB101_5: ; %end +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB101_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: s_branch .LBB101_2 ; ; VI-LABEL: bitcast_v4f16_to_v4bf16_scalar: ; VI: ; %bb.0: @@ -15035,63 +14899,56 @@ define <4 x half> @bitcast_v4bf16_to_v4f16(<4 x bfloat> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB102_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_alignbit_b32 v2, v1, v5, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; SI-NEXT: v_alignbit_b32 v3, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_alignbit_b32 v0, v0, v7, 16 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: .LBB102_2: ; %Flow +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB102_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_alignbit_b32 v3, v2, v3, 16 ; SI-NEXT: .LBB102_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4bf16_to_v4f16: @@ -15319,53 +15176,46 @@ define inreg <4 x half> @bitcast_v4bf16_to_v4f16_scalar(<4 x bfloat> inreg %a, i ; SI-NEXT: s_and_b32 s6, s16, 0xffff0000 ; SI-NEXT: s_lshl_b32 s7, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s7 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s5 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s6 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s5 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s7 ; SI-NEXT: s_cbranch_scc0 .LBB103_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v11 +; SI-NEXT: v_lshr_b64 v[1:2], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v12 +; SI-NEXT: v_lshr_b64 v[9:10], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[7:8], 16 ; SI-NEXT: s_cbranch_execnz .LBB103_3 ; SI-NEXT: .LBB103_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; SI-NEXT: v_lshr_b64 v[5:6], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_lshr_b64 v[1:2], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 +; SI-NEXT: v_lshr_b64 v[9:10], v[0:1], 16 ; SI-NEXT: .LBB103_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB103_4: -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_branch .LBB103_2 ; ; VI-LABEL: bitcast_v4bf16_to_v4f16_scalar: @@ -15617,16 +15467,10 @@ define <8 x i8> @bitcast_v4f16_to_v8i8(<4 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v4f16_to_v8i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 +; SI-NEXT: v_mov_b32_e32 v8, v1 +; SI-NEXT: v_mov_b32_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -15645,10 +15489,12 @@ define <8 x i8> @bitcast_v4f16_to_v8i8(<4 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB104_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_or_b32_e32 v0, v9, v0 -; SI-NEXT: v_or_b32_e32 v4, v8, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_or_b32_e32 v4, v1, v2 ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 ; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 @@ -15862,64 +15708,68 @@ define inreg <8 x i8> @bitcast_v4f16_to_v8i8_scalar(<4 x half> inreg %a, i32 inr ; SI-LABEL: bitcast_v4f16_to_v8i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: s_lshr_b32 s5, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 +; SI-NEXT: s_lshr_b32 s14, s17, 16 +; SI-NEXT: s_lshr_b32 s15, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: s_cbranch_scc0 .LBB105_4 +; SI-NEXT: s_cbranch_scc0 .LBB105_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v11 -; SI-NEXT: v_or_b32_e32 v9, v8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_or_b32_e32 v10, v0, v1 -; SI-NEXT: v_lshr_b64 v[3:4], v[9:10], 24 -; SI-NEXT: v_lshr_b64 v[4:5], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[9:10], 8 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v10 -; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 -; SI-NEXT: s_cbranch_execnz .LBB105_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s14, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 8 +; SI-NEXT: s_lshr_b32 s7, s5, 8 +; SI-NEXT: s_bfe_u32 s9, s14, 0x80008 +; SI-NEXT: s_cbranch_execnz .LBB105_4 ; SI-NEXT: .LBB105_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v3 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v9, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_or_b32_e32 v10, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v9, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_or_b32_e32 v10, v2, v0 ; SI-NEXT: v_lshr_b64 v[3:4], v[9:10], 24 ; SI-NEXT: v_lshr_b64 v[4:5], v[9:10], 16 ; SI-NEXT: v_lshr_b64 v[1:2], v[9:10], 8 ; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v10 ; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 -; SI-NEXT: .LBB105_3: ; %end +; SI-NEXT: s_branch .LBB105_5 +; SI-NEXT: .LBB105_3: +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: s_branch .LBB105_2 +; SI-NEXT: .LBB105_4: +; SI-NEXT: v_mov_b32_e32 v6, s14 +; SI-NEXT: v_mov_b32_e32 v7, s9 +; SI-NEXT: v_mov_b32_e32 v10, s5 +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: v_mov_b32_e32 v5, s7 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: .LBB105_5: ; %end ; SI-NEXT: v_mov_b32_e32 v0, v9 ; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v4, v10 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB105_4: -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: s_branch .LBB105_2 ; ; VI-LABEL: bitcast_v4f16_to_v8i8_scalar: ; VI: ; %bb.0: @@ -16083,72 +15933,78 @@ define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v3 -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB106_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v4, v7, v3 +; SI-NEXT: v_or_b32_e32 v3, v1, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v0, v0, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v0, v0, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_alignbit_b32 v5, v3, v1, 16 +; SI-NEXT: v_or_b32_e32 v1, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: .LBB106_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB106_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v8, v1 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x300, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v9, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v0, v10, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v1, v10, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v6 ; SI-NEXT: v_or_b32_e32 v0, v11, v0 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v2, v7, v2 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: v_alignbit_b32 v5, v3, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 ; SI-NEXT: .LBB106_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i8_to_v4f16: @@ -16440,60 +16296,68 @@ define inreg <4 x half> @bitcast_v8i8_to_v4f16_scalar(<8 x i8> inreg %a, i32 inr ; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_and_b32 s4, s18, 0xff -; SI-NEXT: s_lshl_b32 s5, s19, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: s_lshl_b32 s5, s21, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_and_b32 s4, s22, 0xff -; SI-NEXT: s_lshl_b32 s5, s23, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_or_b32 s10, s6, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s8, s7, s6 +; SI-NEXT: s_or_b32 s11, s5, s8 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshr_b64 s[6:7], s[10:11], 16 +; SI-NEXT: s_or_b32 s4, s4, s10 +; SI-NEXT: s_lshr_b32 s7, s8, 16 +; SI-NEXT: s_mov_b32 s5, s11 ; SI-NEXT: s_cbranch_execnz .LBB107_3 ; SI-NEXT: .LBB107_2: ; %cmp.true -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_and_b32 s4, s22, 0xff -; SI-NEXT: s_lshl_b32 s5, s23, 8 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s20, 0xff ; SI-NEXT: s_lshl_b32 s6, s21, 8 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s18, 0xff -; SI-NEXT: s_lshl_b32 s7, s19, 8 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s16, 0xff -; SI-NEXT: s_lshl_b32 s8, s17, 8 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_and_b32 s7, s22, 0xff ; SI-NEXT: s_addk_i32 s5, 0x300 -; SI-NEXT: s_addk_i32 s6, 0x300 -; SI-NEXT: s_addk_i32 s7, 0x300 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16 +; SI-NEXT: s_lshr_b32 s7, s5, 16 ; SI-NEXT: .LBB107_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s6, s7, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB107_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: s_branch .LBB107_2 ; ; VI-LABEL: bitcast_v8i8_to_v4f16_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll index 6e2167edd97cd..d3fd1ab06c1c2 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll @@ -5540,393 +5540,192 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v61, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v63, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v22 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v29, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_alignbit_b32 v22, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v23, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v24, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v25, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v26, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v27, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v28, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v30, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v32, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v35, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v37, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 ; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_alignbit_b32 v22, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v23, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v24, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v25, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v26, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v27, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v28, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v30, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v32, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v35, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v37, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 ; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v62 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v58 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v56 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v47 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v44 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v43 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v55 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v40 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v51 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v52 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v39 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v48 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v34 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v36 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v30 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v33 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v25 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v29 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v27 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v22i32_to_v44f16: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v37 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v51 +; SI-NEXT: v_or_b32_e32 v2, v2, v35 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v50 +; SI-NEXT: v_or_b32_e32 v4, v4, v32 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v49 +; SI-NEXT: v_or_b32_e32 v6, v6, v30 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v48 +; SI-NEXT: v_or_b32_e32 v8, v8, v28 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 +; SI-NEXT: v_or_b32_e32 v10, v10, v27 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v38 +; SI-NEXT: v_or_b32_e32 v12, v12, v26 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v36 +; SI-NEXT: v_or_b32_e32 v14, v14, v25 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v34 +; SI-NEXT: v_or_b32_e32 v16, v16, v24 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v33 +; SI-NEXT: v_or_b32_e32 v18, v18, v23 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v31 +; SI-NEXT: v_or_b32_e32 v20, v20, v22 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v37 +; SI-NEXT: v_or_b32_e32 v3, v3, v35 +; SI-NEXT: v_or_b32_e32 v5, v5, v32 +; SI-NEXT: v_or_b32_e32 v7, v7, v30 +; SI-NEXT: v_or_b32_e32 v9, v9, v28 +; SI-NEXT: v_or_b32_e32 v11, v11, v27 +; SI-NEXT: v_or_b32_e32 v13, v13, v26 +; SI-NEXT: v_or_b32_e32 v15, v15, v25 +; SI-NEXT: v_or_b32_e32 v17, v17, v24 +; SI-NEXT: v_or_b32_e32 v19, v19, v23 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v22i32_to_v44f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 ; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: ; implicit-def: $vgpr25 @@ -6381,103 +6180,57 @@ define inreg <44 x half> @bitcast_v22i32_to_v44f16_scalar(<22 x i32> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v9, s27 ; SI-NEXT: v_readfirstlane_b32 s25, v10 ; SI-NEXT: v_mov_b32_e32 v10, s28 -; SI-NEXT: v_readfirstlane_b32 s26, v11 +; SI-NEXT: v_readfirstlane_b32 s22, v11 ; SI-NEXT: v_mov_b32_e32 v11, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: v_readfirstlane_b32 s27, v12 -; SI-NEXT: v_readfirstlane_b32 s23, v13 -; SI-NEXT: v_readfirstlane_b32 s22, v14 -; SI-NEXT: v_readfirstlane_b32 s21, v15 -; SI-NEXT: v_readfirstlane_b32 s20, v16 -; SI-NEXT: v_readfirstlane_b32 s19, v17 -; SI-NEXT: v_readfirstlane_b32 s18, v18 -; SI-NEXT: v_readfirstlane_b32 s17, v19 -; SI-NEXT: v_readfirstlane_b32 s16, v9 -; SI-NEXT: v_readfirstlane_b32 s15, v10 -; SI-NEXT: v_readfirstlane_b32 s14, v11 -; SI-NEXT: v_readfirstlane_b32 s13, v0 -; SI-NEXT: v_readfirstlane_b32 s12, v1 -; SI-NEXT: v_readfirstlane_b32 s11, v2 -; SI-NEXT: v_readfirstlane_b32 s10, v3 -; SI-NEXT: v_readfirstlane_b32 s8, v4 +; SI-NEXT: v_readfirstlane_b32 s23, v12 +; SI-NEXT: v_readfirstlane_b32 s20, v13 +; SI-NEXT: v_readfirstlane_b32 s21, v14 +; SI-NEXT: v_readfirstlane_b32 s18, v15 +; SI-NEXT: v_readfirstlane_b32 s19, v16 +; SI-NEXT: v_readfirstlane_b32 s16, v17 +; SI-NEXT: v_readfirstlane_b32 s17, v18 +; SI-NEXT: v_readfirstlane_b32 s14, v19 +; SI-NEXT: v_readfirstlane_b32 s15, v9 +; SI-NEXT: v_readfirstlane_b32 s12, v10 +; SI-NEXT: v_readfirstlane_b32 s13, v11 +; SI-NEXT: v_readfirstlane_b32 s10, v0 +; SI-NEXT: v_readfirstlane_b32 s11, v1 +; SI-NEXT: v_readfirstlane_b32 s8, v2 +; SI-NEXT: v_readfirstlane_b32 s9, v3 +; SI-NEXT: v_readfirstlane_b32 s6, v4 ; SI-NEXT: v_readfirstlane_b32 s7, v5 -; SI-NEXT: v_readfirstlane_b32 s6, v6 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v7 +; SI-NEXT: v_readfirstlane_b32 s4, v6 +; SI-NEXT: s_and_b64 s[26:27], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v7 ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s24 +; SI-NEXT: s_lshr_b32 s76, s5, 16 +; SI-NEXT: s_lshr_b32 s77, s7, 16 +; SI-NEXT: s_lshr_b32 s78, s9, 16 +; SI-NEXT: s_lshr_b32 s79, s11, 16 +; SI-NEXT: s_lshr_b32 s88, s13, 16 +; SI-NEXT: s_lshr_b32 s89, s15, 16 +; SI-NEXT: s_lshr_b32 s90, s17, 16 +; SI-NEXT: s_lshr_b32 s91, s19, 16 +; SI-NEXT: s_lshr_b32 s92, s21, 16 +; SI-NEXT: s_lshr_b32 s93, s23, 16 +; SI-NEXT: s_lshr_b32 s94, s25, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[24:25], 16 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 @@ -6492,217 +6245,153 @@ define inreg <44 x half> @bitcast_v22i32_to_v44f16_scalar(<22 x i32> inreg %a, i ; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: s_lshr_b32 s5, s25, 16 -; SI-NEXT: s_lshr_b32 s28, s26, 16 -; SI-NEXT: s_lshr_b32 s29, s27, 16 -; SI-NEXT: s_lshr_b32 s40, s23, 16 -; SI-NEXT: s_lshr_b32 s41, s22, 16 -; SI-NEXT: s_lshr_b32 s42, s21, 16 -; SI-NEXT: s_lshr_b32 s43, s20, 16 -; SI-NEXT: s_lshr_b32 s44, s19, 16 -; SI-NEXT: s_lshr_b32 s45, s18, 16 -; SI-NEXT: s_lshr_b32 s46, s17, 16 -; SI-NEXT: s_lshr_b32 s47, s16, 16 -; SI-NEXT: s_lshr_b32 s56, s15, 16 -; SI-NEXT: s_lshr_b32 s57, s14, 16 -; SI-NEXT: s_lshr_b32 s58, s13, 16 -; SI-NEXT: s_lshr_b32 s59, s12, 16 -; SI-NEXT: s_lshr_b32 s60, s11, 16 -; SI-NEXT: s_lshr_b32 s61, s10, 16 -; SI-NEXT: s_lshr_b32 s62, s8, 16 -; SI-NEXT: s_lshr_b32 s63, s7, 16 -; SI-NEXT: s_lshr_b32 s72, s6, 16 -; SI-NEXT: s_lshr_b32 s73, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s73 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s72 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s63 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s62 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: s_add_i32 s5, s5, 3 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 16 +; SI-NEXT: s_lshr_b32 s76, s5, 16 +; SI-NEXT: s_lshr_b32 s77, s7, 16 +; SI-NEXT: s_lshr_b32 s78, s9, 16 +; SI-NEXT: s_lshr_b32 s79, s11, 16 +; SI-NEXT: s_lshr_b32 s88, s13, 16 +; SI-NEXT: s_lshr_b32 s89, s15, 16 +; SI-NEXT: s_lshr_b32 s90, s17, 16 +; SI-NEXT: s_lshr_b32 s91, s19, 16 +; SI-NEXT: s_lshr_b32 s92, s21, 16 +; SI-NEXT: s_lshr_b32 s93, s23, 16 +; SI-NEXT: s_lshr_b32 s94, s25, 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[24:25], 16 ; SI-NEXT: .LBB17_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 -; SI-NEXT: v_or_b32_e32 v2, v48, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v48 -; SI-NEXT: v_or_b32_e32 v5, v5, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v38 -; SI-NEXT: v_or_b32_e32 v7, v7, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 -; SI-NEXT: v_or_b32_e32 v9, v34, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v34 -; SI-NEXT: v_or_b32_e32 v11, v32, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 -; SI-NEXT: v_or_b32_e32 v13, v30, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v30 -; SI-NEXT: v_or_b32_e32 v15, v28, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 -; SI-NEXT: v_or_b32_e32 v17, v26, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v19, v24, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v24 -; SI-NEXT: v_or_b32_e32 v1, v52, v1 -; SI-NEXT: v_or_b32_e32 v3, v50, v3 -; SI-NEXT: v_or_b32_e32 v4, v39, v4 -; SI-NEXT: v_or_b32_e32 v6, v37, v6 -; SI-NEXT: v_or_b32_e32 v8, v35, v8 -; SI-NEXT: v_or_b32_e32 v10, v33, v10 -; SI-NEXT: v_or_b32_e32 v12, v31, v12 -; SI-NEXT: v_or_b32_e32 v14, v29, v14 -; SI-NEXT: v_or_b32_e32 v16, v27, v16 -; SI-NEXT: v_or_b32_e32 v18, v25, v18 -; SI-NEXT: v_or_b32_e32 v20, v23, v20 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB17_4: -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: s_branch .LBB17_2 -; -; VI-LABEL: bitcast_v22i32_to_v44f16_scalar: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v9, s16 +; SI-NEXT: s_lshl_b32 s27, s72, 16 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_or_b32 s24, s24, s27 +; SI-NEXT: s_and_b32 s25, s25, 0xffff +; SI-NEXT: s_lshl_b32 s27, s94, 16 +; SI-NEXT: s_or_b32 s25, s25, s27 +; SI-NEXT: s_lshl_b32 s27, s62, 16 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_or_b32 s22, s22, s27 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s27, s93, 16 +; SI-NEXT: s_or_b32 s23, s23, s27 +; SI-NEXT: s_lshl_b32 s27, s60, 16 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_or_b32 s20, s20, s27 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s27, s92, 16 +; SI-NEXT: s_or_b32 s21, s21, s27 +; SI-NEXT: s_lshl_b32 s27, s58, 16 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_or_b32 s18, s18, s27 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s27, s91, 16 +; SI-NEXT: s_or_b32 s19, s19, s27 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s27, s56, 16 +; SI-NEXT: s_or_b32 s16, s16, s27 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s27, s90, 16 +; SI-NEXT: s_or_b32 s17, s17, s27 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s27, s46, 16 +; SI-NEXT: s_or_b32 s14, s14, s27 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s27, s89, 16 +; SI-NEXT: s_or_b32 s15, s15, s27 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s27, s44, 16 +; SI-NEXT: s_or_b32 s12, s12, s27 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s27, s88, 16 +; SI-NEXT: s_or_b32 s13, s13, s27 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s27, s42, 16 +; SI-NEXT: s_or_b32 s10, s10, s27 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s27, s79, 16 +; SI-NEXT: s_or_b32 s11, s11, s27 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s27, s40, 16 +; SI-NEXT: s_or_b32 s8, s8, s27 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s27, s78, 16 +; SI-NEXT: s_or_b32 s9, s9, s27 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s27, s28, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 s6, s6, s27 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s27, s77, 16 +; SI-NEXT: s_or_b32 s4, s4, s26 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s26, s76, 16 +; SI-NEXT: s_or_b32 s7, s7, s27 +; SI-NEXT: s_or_b32 s5, s5, s26 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_mov_b32_e32 v1, s25 +; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: v_mov_b32_e32 v3, s23 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s18 +; SI-NEXT: v_mov_b32_e32 v7, s19 +; SI-NEXT: v_mov_b32_e32 v8, s16 +; SI-NEXT: v_mov_b32_e32 v9, s17 +; SI-NEXT: v_mov_b32_e32 v10, s14 +; SI-NEXT: v_mov_b32_e32 v11, s15 +; SI-NEXT: v_mov_b32_e32 v12, s12 +; SI-NEXT: v_mov_b32_e32 v13, s13 +; SI-NEXT: v_mov_b32_e32 v14, s10 +; SI-NEXT: v_mov_b32_e32 v15, s11 +; SI-NEXT: v_mov_b32_e32 v16, s8 +; SI-NEXT: v_mov_b32_e32 v17, s9 +; SI-NEXT: v_mov_b32_e32 v18, s6 +; SI-NEXT: v_mov_b32_e32 v19, s7 +; SI-NEXT: v_mov_b32_e32 v20, s4 +; SI-NEXT: v_mov_b32_e32 v21, s5 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_v22i32_to_v44f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, s16 ; VI-NEXT: v_mov_b32_e32 v10, s17 ; VI-NEXT: v_mov_b32_e32 v11, s18 ; VI-NEXT: v_mov_b32_e32 v12, s19 @@ -7296,57 +6985,6 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v44f16_to_v22i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -7363,142 +7001,137 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_mov_b32_e32 v45, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_mov_b32_e32 v34, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_mov_b32_e32 v35, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_mov_b32_e32 v36, v17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_mov_b32_e32 v37, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_mov_b32_e32 v38, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_mov_b32_e32 v39, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_mov_b32_e32 v48, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v11 +; SI-NEXT: v_mov_b32_e32 v51, v10 +; SI-NEXT: v_mov_b32_e32 v52, v9 +; SI-NEXT: v_mov_b32_e32 v53, v8 +; SI-NEXT: v_mov_b32_e32 v54, v7 +; SI-NEXT: v_mov_b32_e32 v55, v6 +; SI-NEXT: v_mov_b32_e32 v40, v5 +; SI-NEXT: v_mov_b32_e32 v41, v4 +; SI-NEXT: v_mov_b32_e32 v42, v3 +; SI-NEXT: v_mov_b32_e32 v43, v2 +; SI-NEXT: v_mov_b32_e32 v44, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v45 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v62 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v60 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v59 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v58 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v53 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v53 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_or_b32_e32 v2, v38, v2 -; SI-NEXT: v_or_b32_e32 v3, v36, v3 -; SI-NEXT: v_or_b32_e32 v4, v34, v4 -; SI-NEXT: v_or_b32_e32 v5, v32, v5 -; SI-NEXT: v_or_b32_e32 v6, v62, v6 -; SI-NEXT: v_or_b32_e32 v7, v60, v7 -; SI-NEXT: v_or_b32_e32 v8, v58, v8 -; SI-NEXT: v_or_b32_e32 v9, v56, v9 -; SI-NEXT: v_or_b32_e32 v10, v46, v10 -; SI-NEXT: v_or_b32_e32 v11, v44, v11 -; SI-NEXT: v_or_b32_e32 v12, v42, v12 -; SI-NEXT: v_or_b32_e32 v13, v40, v13 -; SI-NEXT: v_or_b32_e32 v14, v54, v14 -; SI-NEXT: v_or_b32_e32 v15, v52, v15 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v47 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v46 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v49 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr60 @@ -7508,45 +7141,58 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v38 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v37 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v36 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v34 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; kill: killed $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr22 @@ -7576,10 +7222,10 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB18_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -7592,10 +7238,10 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -7604,168 +7250,170 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v55 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v60 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v56 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v49 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v43 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v53 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v52 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v47 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v45 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v46 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v34 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v41 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v39 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v36 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v18, v20, v18 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 @@ -8384,196 +8032,126 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; SI-LABEL: bitcast_v44f16_to_v22i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s40, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s40 -; SI-NEXT: s_lshr_b32 s40, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: s_lshr_b32 s15, s18, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v61, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v19 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v62, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s19 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v18 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s21 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s23 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s25 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s27 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: v_mov_b32_e32 v32, v7 +; SI-NEXT: v_mov_b32_e32 v33, v6 +; SI-NEXT: v_mov_b32_e32 v34, v5 +; SI-NEXT: v_mov_b32_e32 v35, v4 +; SI-NEXT: v_mov_b32_e32 v36, v3 +; SI-NEXT: v_mov_b32_e32 v37, v2 +; SI-NEXT: v_mov_b32_e32 v38, v1 +; SI-NEXT: v_mov_b32_e32 v39, v0 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v38 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v39 ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 -; SI-NEXT: v_or_b32_e32 v2, v21, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 -; SI-NEXT: v_or_b32_e32 v3, v58, v3 -; SI-NEXT: v_or_b32_e32 v4, v56, v4 -; SI-NEXT: v_or_b32_e32 v5, v50, v5 -; SI-NEXT: v_or_b32_e32 v6, v49, v6 -; SI-NEXT: v_or_b32_e32 v7, v38, v7 -; SI-NEXT: v_or_b32_e32 v8, v37, v8 -; SI-NEXT: v_or_b32_e32 v9, v34, v9 -; SI-NEXT: v_or_b32_e32 v10, v33, v10 -; SI-NEXT: v_or_b32_e32 v11, v45, v11 -; SI-NEXT: v_or_b32_e32 v12, v44, v12 -; SI-NEXT: v_or_b32_e32 v13, v42, v13 -; SI-NEXT: v_or_b32_e32 v14, v40, v14 -; SI-NEXT: v_or_b32_e32 v15, v54, v15 -; SI-NEXT: v_or_b32_e32 v16, v52, v16 -; SI-NEXT: v_or_b32_e32 v17, v30, v17 -; SI-NEXT: v_or_b32_e32 v18, v28, v18 -; SI-NEXT: v_or_b32_e32 v19, v26, v19 -; SI-NEXT: v_or_b32_e32 v20, v24, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -8586,167 +8164,160 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v4, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v56 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v33 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v44 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v39 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v38 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v37 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v52 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v52 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v51 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v28 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v26 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v48 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v32 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v60 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v47 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v48 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v39 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v45 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v43 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v54 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v53 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v36 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v30 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v50 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v33 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 @@ -8754,111 +8325,9 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 ; SI-NEXT: .LBB19_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB19_4: -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v51, v33 -; SI-NEXT: v_mov_b32_e32 v50, v32 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v48, v47 -; SI-NEXT: v_mov_b32_e32 v47, v53 -; SI-NEXT: v_mov_b32_e32 v53, v22 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v39, v56 -; SI-NEXT: v_mov_b32_e32 v56, v54 -; SI-NEXT: v_mov_b32_e32 v54, v24 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v38, v57 -; SI-NEXT: v_mov_b32_e32 v57, v55 -; SI-NEXT: v_mov_b32_e32 v55, v25 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v37, v58 -; SI-NEXT: v_mov_b32_e32 v58, v40 -; SI-NEXT: v_mov_b32_e32 v40, v26 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v36, v59 -; SI-NEXT: v_mov_b32_e32 v59, v41 -; SI-NEXT: v_mov_b32_e32 v41, v27 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v35, v60 -; SI-NEXT: v_mov_b32_e32 v60, v42 -; SI-NEXT: v_mov_b32_e32 v42, v28 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v34, v61 -; SI-NEXT: v_mov_b32_e32 v61, v43 -; SI-NEXT: v_mov_b32_e32 v43, v29 -; SI-NEXT: v_mov_b32_e32 v33, v62 -; SI-NEXT: v_mov_b32_e32 v62, v44 -; SI-NEXT: v_mov_b32_e32 v44, v30 -; SI-NEXT: v_mov_b32_e32 v32, v63 -; SI-NEXT: v_mov_b32_e32 v63, v45 -; SI-NEXT: v_mov_b32_e32 v45, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v31, v45 -; SI-NEXT: v_mov_b32_e32 v45, v63 -; SI-NEXT: v_mov_b32_e32 v63, v32 -; SI-NEXT: v_mov_b32_e32 v30, v44 -; SI-NEXT: v_mov_b32_e32 v44, v62 -; SI-NEXT: v_mov_b32_e32 v62, v33 -; SI-NEXT: v_mov_b32_e32 v29, v43 -; SI-NEXT: v_mov_b32_e32 v43, v61 -; SI-NEXT: v_mov_b32_e32 v61, v34 -; SI-NEXT: v_mov_b32_e32 v28, v42 -; SI-NEXT: v_mov_b32_e32 v42, v60 -; SI-NEXT: v_mov_b32_e32 v60, v35 -; SI-NEXT: v_mov_b32_e32 v27, v41 -; SI-NEXT: v_mov_b32_e32 v41, v59 -; SI-NEXT: v_mov_b32_e32 v59, v36 -; SI-NEXT: v_mov_b32_e32 v26, v40 -; SI-NEXT: v_mov_b32_e32 v40, v58 -; SI-NEXT: v_mov_b32_e32 v58, v37 -; SI-NEXT: v_mov_b32_e32 v25, v55 -; SI-NEXT: v_mov_b32_e32 v55, v57 -; SI-NEXT: v_mov_b32_e32 v57, v38 -; SI-NEXT: v_mov_b32_e32 v24, v54 -; SI-NEXT: v_mov_b32_e32 v54, v56 -; SI-NEXT: v_mov_b32_e32 v56, v39 -; SI-NEXT: v_mov_b32_e32 v22, v53 -; SI-NEXT: v_mov_b32_e32 v53, v47 -; SI-NEXT: v_mov_b32_e32 v47, v48 -; SI-NEXT: v_mov_b32_e32 v32, v50 -; SI-NEXT: v_mov_b32_e32 v33, v51 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB19_2 ; ; VI-LABEL: bitcast_v44f16_to_v22i32_scalar: @@ -14098,371 +13567,170 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v61, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v63, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v22 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v29, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_alignbit_b32 v22, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v23, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v24, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v25, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v26, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v27, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v28, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v30, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v32, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v35, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v37, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 ; SI-NEXT: .LBB32_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 ; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_alignbit_b32 v22, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v23, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v24, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v25, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v26, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v27, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v28, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v30, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v32, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v35, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v37, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 ; SI-NEXT: .LBB32_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v62 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v58 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v56 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v47 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v44 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v43 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v55 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v40 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v51 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v52 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v39 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v48 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v34 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v36 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v30 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v33 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v25 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v29 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v27 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v37 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v51 +; SI-NEXT: v_or_b32_e32 v2, v2, v35 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v50 +; SI-NEXT: v_or_b32_e32 v4, v4, v32 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v49 +; SI-NEXT: v_or_b32_e32 v6, v6, v30 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v48 +; SI-NEXT: v_or_b32_e32 v8, v8, v28 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 +; SI-NEXT: v_or_b32_e32 v10, v10, v27 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v38 +; SI-NEXT: v_or_b32_e32 v12, v12, v26 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v36 +; SI-NEXT: v_or_b32_e32 v14, v14, v25 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v34 +; SI-NEXT: v_or_b32_e32 v16, v16, v24 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v33 +; SI-NEXT: v_or_b32_e32 v18, v18, v23 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v31 +; SI-NEXT: v_or_b32_e32 v20, v20, v22 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v37 +; SI-NEXT: v_or_b32_e32 v3, v3, v35 +; SI-NEXT: v_or_b32_e32 v5, v5, v32 +; SI-NEXT: v_or_b32_e32 v7, v7, v30 +; SI-NEXT: v_or_b32_e32 v9, v9, v28 +; SI-NEXT: v_or_b32_e32 v11, v11, v27 +; SI-NEXT: v_or_b32_e32 v13, v13, v26 +; SI-NEXT: v_or_b32_e32 v15, v15, v25 +; SI-NEXT: v_or_b32_e32 v17, v17, v24 +; SI-NEXT: v_or_b32_e32 v19, v19, v23 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v22f32_to_v44f16: @@ -14903,361 +14171,190 @@ define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v21, s16 -; SI-NEXT: v_mov_b32_e32 v20, s17 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v63, s18 -; SI-NEXT: v_mov_b32_e32 v62, s19 -; SI-NEXT: v_mov_b32_e32 v60, s20 -; SI-NEXT: v_mov_b32_e32 v59, s21 -; SI-NEXT: v_mov_b32_e32 v58, s22 -; SI-NEXT: v_mov_b32_e32 v47, s23 -; SI-NEXT: v_mov_b32_e32 v44, s24 -; SI-NEXT: v_mov_b32_e32 v43, s25 -; SI-NEXT: v_mov_b32_e32 v42, s26 -; SI-NEXT: v_mov_b32_e32 v56, s27 +; SI-NEXT: v_mov_b32_e32 v22, s16 +; SI-NEXT: v_mov_b32_e32 v23, s17 +; SI-NEXT: v_mov_b32_e32 v20, s18 +; SI-NEXT: v_mov_b32_e32 v21, s19 +; SI-NEXT: v_mov_b32_e32 v18, s20 +; SI-NEXT: v_mov_b32_e32 v19, s21 +; SI-NEXT: v_mov_b32_e32 v16, s22 +; SI-NEXT: v_mov_b32_e32 v17, s23 +; SI-NEXT: v_mov_b32_e32 v9, s24 +; SI-NEXT: v_mov_b32_e32 v10, s25 +; SI-NEXT: v_mov_b32_e32 v14, s26 +; SI-NEXT: v_mov_b32_e32 v15, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v57, s28 -; SI-NEXT: v_mov_b32_e32 v45, s29 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB33_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v58 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v22 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v45 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v43 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v21 +; SI-NEXT: v_lshr_b64 v[30:31], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[31:32], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[33:34], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[9:10], 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v23 +; SI-NEXT: v_lshr_b64 v[37:38], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[22:23], 16 ; SI-NEXT: s_cbranch_execnz .LBB33_3 ; SI-NEXT: .LBB33_2: ; %cmp.true ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v63 -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v7 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v47 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v21 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v20 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v62 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v60 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v59 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v58 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v44 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v43 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v42 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v56 -; SI-NEXT: v_add_f32_e32 v31, 1.0, v57 -; SI-NEXT: v_add_f32_e32 v30, 1.0, v45 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 ; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshr_b64 v[30:31], v[6:7], 16 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_lshr_b64 v[31:32], v[4:5], 16 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshr_b64 v[32:33], v[2:3], 16 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_lshr_b64 v[33:34], v[0:1], 16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_lshr_b64 v[34:35], v[12:13], 16 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_lshr_b64 v[35:36], v[14:15], 16 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_lshr_b64 v[36:37], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[22:23], 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v23 ; SI-NEXT: .LBB33_3: ; %end -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v40 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v49 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v52 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v38 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v37 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v35 -; SI-NEXT: v_or_b32_e32 v7, v21, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v8, v20, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v33 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v10, v20, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v31 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v12, v20, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v29 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v14, v20, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v16 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v16, v20, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v25 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_or_b32_e32 v9, v34, v9 -; SI-NEXT: v_or_b32_e32 v11, v32, v11 -; SI-NEXT: v_or_b32_e32 v13, v30, v13 -; SI-NEXT: v_or_b32_e32 v15, v28, v15 -; SI-NEXT: v_or_b32_e32 v17, v26, v17 -; SI-NEXT: v_or_b32_e32 v19, v24, v19 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v23 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v24, v22, v24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v55 +; SI-NEXT: v_or_b32_e32 v25, v22, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v26 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v26, v20, v22 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v54 +; SI-NEXT: v_or_b32_e32 v27, v20, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v28, v18, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v53 +; SI-NEXT: v_or_b32_e32 v29, v18, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v37 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v22, v16, v18 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v23, v16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v52 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v35 +; SI-NEXT: v_or_b32_e32 v10, v10, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v14, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v51 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 +; SI-NEXT: v_or_b32_e32 v14, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v24 +; SI-NEXT: v_mov_b32_e32 v1, v25 +; SI-NEXT: v_mov_b32_e32 v2, v26 +; SI-NEXT: v_mov_b32_e32 v3, v27 +; SI-NEXT: v_mov_b32_e32 v4, v28 +; SI-NEXT: v_mov_b32_e32 v5, v29 +; SI-NEXT: v_mov_b32_e32 v6, v22 +; SI-NEXT: v_mov_b32_e32 v7, v23 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: s_branch .LBB33_2 ; ; VI-LABEL: bitcast_v22f32_to_v44f16_scalar: @@ -15906,57 +15003,6 @@ define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v44f16_to_v22f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -15973,142 +15019,137 @@ define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_mov_b32_e32 v45, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_mov_b32_e32 v34, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_mov_b32_e32 v35, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_mov_b32_e32 v36, v17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_mov_b32_e32 v37, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_mov_b32_e32 v38, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_mov_b32_e32 v39, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_mov_b32_e32 v48, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v11 +; SI-NEXT: v_mov_b32_e32 v51, v10 +; SI-NEXT: v_mov_b32_e32 v52, v9 +; SI-NEXT: v_mov_b32_e32 v53, v8 +; SI-NEXT: v_mov_b32_e32 v54, v7 +; SI-NEXT: v_mov_b32_e32 v55, v6 +; SI-NEXT: v_mov_b32_e32 v40, v5 +; SI-NEXT: v_mov_b32_e32 v41, v4 +; SI-NEXT: v_mov_b32_e32 v42, v3 +; SI-NEXT: v_mov_b32_e32 v43, v2 +; SI-NEXT: v_mov_b32_e32 v44, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v45 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v62 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v60 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v59 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v58 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v53 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v53 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_or_b32_e32 v2, v38, v2 -; SI-NEXT: v_or_b32_e32 v3, v36, v3 -; SI-NEXT: v_or_b32_e32 v4, v34, v4 -; SI-NEXT: v_or_b32_e32 v5, v32, v5 -; SI-NEXT: v_or_b32_e32 v6, v62, v6 -; SI-NEXT: v_or_b32_e32 v7, v60, v7 -; SI-NEXT: v_or_b32_e32 v8, v58, v8 -; SI-NEXT: v_or_b32_e32 v9, v56, v9 -; SI-NEXT: v_or_b32_e32 v10, v46, v10 -; SI-NEXT: v_or_b32_e32 v11, v44, v11 -; SI-NEXT: v_or_b32_e32 v12, v42, v12 -; SI-NEXT: v_or_b32_e32 v13, v40, v13 -; SI-NEXT: v_or_b32_e32 v14, v54, v14 -; SI-NEXT: v_or_b32_e32 v15, v52, v15 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v47 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v46 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v49 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr60 @@ -16118,45 +15159,58 @@ define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v38 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v37 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v36 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v34 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; kill: killed $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr22 @@ -16186,10 +15240,10 @@ define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB34_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -16202,10 +15256,10 @@ define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -16214,168 +15268,170 @@ define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v55 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v60 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v56 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v49 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v43 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v53 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v52 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v47 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v45 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v46 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v34 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v41 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v39 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v36 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v18, v20, v18 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 @@ -16994,196 +16050,126 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; SI-LABEL: bitcast_v44f16_to_v22f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s40, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s40 -; SI-NEXT: s_lshr_b32 s40, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: s_lshr_b32 s15, s18, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v61, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v19 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v62, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s19 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v18 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s21 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s23 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s25 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s27 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: v_mov_b32_e32 v32, v7 +; SI-NEXT: v_mov_b32_e32 v33, v6 +; SI-NEXT: v_mov_b32_e32 v34, v5 +; SI-NEXT: v_mov_b32_e32 v35, v4 +; SI-NEXT: v_mov_b32_e32 v36, v3 +; SI-NEXT: v_mov_b32_e32 v37, v2 +; SI-NEXT: v_mov_b32_e32 v38, v1 +; SI-NEXT: v_mov_b32_e32 v39, v0 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v38 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v39 ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 -; SI-NEXT: v_or_b32_e32 v2, v21, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 -; SI-NEXT: v_or_b32_e32 v3, v58, v3 -; SI-NEXT: v_or_b32_e32 v4, v56, v4 -; SI-NEXT: v_or_b32_e32 v5, v50, v5 -; SI-NEXT: v_or_b32_e32 v6, v49, v6 -; SI-NEXT: v_or_b32_e32 v7, v38, v7 -; SI-NEXT: v_or_b32_e32 v8, v37, v8 -; SI-NEXT: v_or_b32_e32 v9, v34, v9 -; SI-NEXT: v_or_b32_e32 v10, v33, v10 -; SI-NEXT: v_or_b32_e32 v11, v45, v11 -; SI-NEXT: v_or_b32_e32 v12, v44, v12 -; SI-NEXT: v_or_b32_e32 v13, v42, v13 -; SI-NEXT: v_or_b32_e32 v14, v40, v14 -; SI-NEXT: v_or_b32_e32 v15, v54, v15 -; SI-NEXT: v_or_b32_e32 v16, v52, v16 -; SI-NEXT: v_or_b32_e32 v17, v30, v17 -; SI-NEXT: v_or_b32_e32 v18, v28, v18 -; SI-NEXT: v_or_b32_e32 v19, v26, v19 -; SI-NEXT: v_or_b32_e32 v20, v24, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -17196,167 +16182,160 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v4, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v56 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v33 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v44 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v39 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v38 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v37 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v52 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v52 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v51 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v28 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v26 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v48 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v32 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v60 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v47 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v48 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v39 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v45 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v43 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v54 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v53 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v36 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v30 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v50 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v33 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 @@ -17364,111 +16343,9 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 ; SI-NEXT: .LBB35_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB35_4: -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v51, v33 -; SI-NEXT: v_mov_b32_e32 v50, v32 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v48, v47 -; SI-NEXT: v_mov_b32_e32 v47, v53 -; SI-NEXT: v_mov_b32_e32 v53, v22 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v39, v56 -; SI-NEXT: v_mov_b32_e32 v56, v54 -; SI-NEXT: v_mov_b32_e32 v54, v24 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v38, v57 -; SI-NEXT: v_mov_b32_e32 v57, v55 -; SI-NEXT: v_mov_b32_e32 v55, v25 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v37, v58 -; SI-NEXT: v_mov_b32_e32 v58, v40 -; SI-NEXT: v_mov_b32_e32 v40, v26 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v36, v59 -; SI-NEXT: v_mov_b32_e32 v59, v41 -; SI-NEXT: v_mov_b32_e32 v41, v27 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v35, v60 -; SI-NEXT: v_mov_b32_e32 v60, v42 -; SI-NEXT: v_mov_b32_e32 v42, v28 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v34, v61 -; SI-NEXT: v_mov_b32_e32 v61, v43 -; SI-NEXT: v_mov_b32_e32 v43, v29 -; SI-NEXT: v_mov_b32_e32 v33, v62 -; SI-NEXT: v_mov_b32_e32 v62, v44 -; SI-NEXT: v_mov_b32_e32 v44, v30 -; SI-NEXT: v_mov_b32_e32 v32, v63 -; SI-NEXT: v_mov_b32_e32 v63, v45 -; SI-NEXT: v_mov_b32_e32 v45, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v31, v45 -; SI-NEXT: v_mov_b32_e32 v45, v63 -; SI-NEXT: v_mov_b32_e32 v63, v32 -; SI-NEXT: v_mov_b32_e32 v30, v44 -; SI-NEXT: v_mov_b32_e32 v44, v62 -; SI-NEXT: v_mov_b32_e32 v62, v33 -; SI-NEXT: v_mov_b32_e32 v29, v43 -; SI-NEXT: v_mov_b32_e32 v43, v61 -; SI-NEXT: v_mov_b32_e32 v61, v34 -; SI-NEXT: v_mov_b32_e32 v28, v42 -; SI-NEXT: v_mov_b32_e32 v42, v60 -; SI-NEXT: v_mov_b32_e32 v60, v35 -; SI-NEXT: v_mov_b32_e32 v27, v41 -; SI-NEXT: v_mov_b32_e32 v41, v59 -; SI-NEXT: v_mov_b32_e32 v59, v36 -; SI-NEXT: v_mov_b32_e32 v26, v40 -; SI-NEXT: v_mov_b32_e32 v40, v58 -; SI-NEXT: v_mov_b32_e32 v58, v37 -; SI-NEXT: v_mov_b32_e32 v25, v55 -; SI-NEXT: v_mov_b32_e32 v55, v57 -; SI-NEXT: v_mov_b32_e32 v57, v38 -; SI-NEXT: v_mov_b32_e32 v24, v54 -; SI-NEXT: v_mov_b32_e32 v54, v56 -; SI-NEXT: v_mov_b32_e32 v56, v39 -; SI-NEXT: v_mov_b32_e32 v22, v53 -; SI-NEXT: v_mov_b32_e32 v53, v47 -; SI-NEXT: v_mov_b32_e32 v47, v48 -; SI-NEXT: v_mov_b32_e32 v32, v50 -; SI-NEXT: v_mov_b32_e32 v33, v51 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB35_2 ; ; VI-LABEL: bitcast_v44f16_to_v22f32_scalar: @@ -21934,198 +20811,87 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s0, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB43_3: ; %end -; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB43_4: -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-FAKE16-NEXT: s_branch .LBB43_2 - %cmp = icmp eq i32 %b, 0 - br i1 %cmp, label %cmp.true, label %cmp.false - -cmp.true: - %a1 = add <44 x i16> %a, splat (i16 3) - %a2 = bitcast <44 x i16> %a1 to <11 x i64> - br label %end - -cmp.false: - %a3 = bitcast <44 x i16> %a to <11 x i64> - br label %end - -end: - %phi = phi <11 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <11 x i64> %phi -} - -define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { -; SI-LABEL: bitcast_v11i64_to_v44f16: -; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB43_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB43_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB43_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <44 x i16> %a, splat (i16 3) + %a2 = bitcast <44 x i16> %a1 to <11 x i64> + br label %end + +cmp.false: + %a3 = bitcast <44 x i16> %a to <11 x i64> + br label %end + +end: + %phi = phi <11 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <11 x i64> %phi +} + +define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v11i64_to_v44f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v61, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v63, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v22 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v29, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_alignbit_b32 v22, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v23, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v24, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v25, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v26, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v27, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v28, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v29, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v32, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v34, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v37, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 ; SI-NEXT: .LBB44_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_4 @@ -22150,188 +20916,98 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v22, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v23, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v24, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v25, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v26, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v27, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v28, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v29, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v32, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v34, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v37, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 ; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v62 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v58 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v56 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v47 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v44 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v43 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v55 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v40 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v51 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v52 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v39 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v48 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v34 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v36 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v30 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v33 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v25 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v29 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v27 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v37 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v51 +; SI-NEXT: v_or_b32_e32 v2, v2, v34 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v50 +; SI-NEXT: v_or_b32_e32 v4, v4, v32 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v49 +; SI-NEXT: v_or_b32_e32 v6, v6, v29 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_or_b32_e32 v8, v8, v28 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 +; SI-NEXT: v_or_b32_e32 v10, v10, v27 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v38 +; SI-NEXT: v_or_b32_e32 v12, v12, v26 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v36 +; SI-NEXT: v_or_b32_e32 v14, v14, v25 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v35 +; SI-NEXT: v_or_b32_e32 v16, v16, v24 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v33 +; SI-NEXT: v_or_b32_e32 v18, v18, v23 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v31 +; SI-NEXT: v_or_b32_e32 v20, v20, v22 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v37 +; SI-NEXT: v_or_b32_e32 v3, v3, v34 +; SI-NEXT: v_or_b32_e32 v5, v5, v32 +; SI-NEXT: v_or_b32_e32 v7, v7, v29 +; SI-NEXT: v_or_b32_e32 v9, v9, v28 +; SI-NEXT: v_or_b32_e32 v11, v11, v27 +; SI-NEXT: v_or_b32_e32 v13, v13, v26 +; SI-NEXT: v_or_b32_e32 v15, v15, v25 +; SI-NEXT: v_or_b32_e32 v17, v17, v24 +; SI-NEXT: v_or_b32_e32 v19, v19, v23 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v11i64_to_v44f16: @@ -22818,324 +21494,214 @@ define inreg <44 x half> @bitcast_v11i64_to_v44f16_scalar(<11 x i64> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v19, s26 ; SI-NEXT: v_readfirstlane_b32 s24, v9 ; SI-NEXT: v_mov_b32_e32 v9, s27 -; SI-NEXT: v_readfirstlane_b32 s26, v10 +; SI-NEXT: v_readfirstlane_b32 s25, v10 ; SI-NEXT: v_mov_b32_e32 v10, s28 -; SI-NEXT: v_readfirstlane_b32 s25, v11 +; SI-NEXT: v_readfirstlane_b32 s22, v11 ; SI-NEXT: v_mov_b32_e32 v11, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: v_readfirstlane_b32 s27, v12 -; SI-NEXT: v_readfirstlane_b32 s22, v13 -; SI-NEXT: v_readfirstlane_b32 s23, v14 -; SI-NEXT: v_readfirstlane_b32 s20, v15 -; SI-NEXT: v_readfirstlane_b32 s21, v16 -; SI-NEXT: v_readfirstlane_b32 s18, v17 -; SI-NEXT: v_readfirstlane_b32 s19, v18 -; SI-NEXT: v_readfirstlane_b32 s16, v19 -; SI-NEXT: v_readfirstlane_b32 s17, v9 -; SI-NEXT: v_readfirstlane_b32 s14, v10 -; SI-NEXT: v_readfirstlane_b32 s15, v11 -; SI-NEXT: v_readfirstlane_b32 s12, v0 -; SI-NEXT: v_readfirstlane_b32 s13, v1 -; SI-NEXT: v_readfirstlane_b32 s10, v2 -; SI-NEXT: v_readfirstlane_b32 s11, v3 -; SI-NEXT: v_readfirstlane_b32 s7, v4 -; SI-NEXT: v_readfirstlane_b32 s8, v5 -; SI-NEXT: v_readfirstlane_b32 s6, v6 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v7 +; SI-NEXT: v_readfirstlane_b32 s23, v12 +; SI-NEXT: v_readfirstlane_b32 s20, v13 +; SI-NEXT: v_readfirstlane_b32 s21, v14 +; SI-NEXT: v_readfirstlane_b32 s18, v15 +; SI-NEXT: v_readfirstlane_b32 s19, v16 +; SI-NEXT: v_readfirstlane_b32 s16, v17 +; SI-NEXT: v_readfirstlane_b32 s17, v18 +; SI-NEXT: v_readfirstlane_b32 s14, v19 +; SI-NEXT: v_readfirstlane_b32 s15, v9 +; SI-NEXT: v_readfirstlane_b32 s12, v10 +; SI-NEXT: v_readfirstlane_b32 s13, v11 +; SI-NEXT: v_readfirstlane_b32 s10, v0 +; SI-NEXT: v_readfirstlane_b32 s11, v1 +; SI-NEXT: v_readfirstlane_b32 s8, v2 +; SI-NEXT: v_readfirstlane_b32 s9, v3 +; SI-NEXT: v_readfirstlane_b32 s6, v4 +; SI-NEXT: v_readfirstlane_b32 s7, v5 +; SI-NEXT: v_readfirstlane_b32 s4, v6 +; SI-NEXT: s_and_b64 s[26:27], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v7 ; SI-NEXT: s_cbranch_scc0 .LBB45_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s24 +; SI-NEXT: s_lshr_b32 s76, s5, 16 +; SI-NEXT: s_lshr_b32 s77, s7, 16 +; SI-NEXT: s_lshr_b32 s78, s9, 16 +; SI-NEXT: s_lshr_b32 s79, s11, 16 +; SI-NEXT: s_lshr_b32 s88, s13, 16 +; SI-NEXT: s_lshr_b32 s89, s15, 16 +; SI-NEXT: s_lshr_b32 s90, s17, 16 +; SI-NEXT: s_lshr_b32 s91, s19, 16 +; SI-NEXT: s_lshr_b32 s92, s21, 16 +; SI-NEXT: s_lshr_b32 s93, s23, 16 +; SI-NEXT: s_lshr_b32 s94, s25, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[24:25], 16 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s24, 3 -; SI-NEXT: s_addc_u32 s5, s26, 0 -; SI-NEXT: s_lshr_b32 s24, s4, 16 -; SI-NEXT: s_lshr_b32 s26, s5, 16 -; SI-NEXT: s_add_u32 s25, s25, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 -; SI-NEXT: s_lshr_b32 s28, s25, 16 -; SI-NEXT: s_lshr_b32 s29, s27, 16 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_lshr_b32 s40, s22, 16 -; SI-NEXT: s_lshr_b32 s41, s23, 16 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_lshr_b32 s42, s20, 16 -; SI-NEXT: s_lshr_b32 s43, s21, 16 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_lshr_b32 s44, s18, 16 -; SI-NEXT: s_lshr_b32 s45, s19, 16 -; SI-NEXT: s_add_u32 s16, s16, 3 -; SI-NEXT: s_addc_u32 s17, s17, 0 -; SI-NEXT: s_lshr_b32 s46, s16, 16 -; SI-NEXT: s_lshr_b32 s47, s17, 16 -; SI-NEXT: s_add_u32 s14, s14, 3 -; SI-NEXT: s_addc_u32 s15, s15, 0 -; SI-NEXT: s_lshr_b32 s56, s14, 16 -; SI-NEXT: s_lshr_b32 s57, s15, 16 -; SI-NEXT: s_add_u32 s12, s12, 3 -; SI-NEXT: s_addc_u32 s13, s13, 0 -; SI-NEXT: s_lshr_b32 s58, s12, 16 -; SI-NEXT: s_lshr_b32 s59, s13, 16 -; SI-NEXT: s_add_u32 s10, s10, 3 -; SI-NEXT: s_addc_u32 s11, s11, 0 -; SI-NEXT: s_lshr_b32 s60, s10, 16 -; SI-NEXT: s_lshr_b32 s61, s11, 16 -; SI-NEXT: s_add_u32 s7, s7, 3 -; SI-NEXT: s_addc_u32 s8, s8, 0 -; SI-NEXT: s_lshr_b32 s62, s7, 16 -; SI-NEXT: s_lshr_b32 s63, s8, 16 +; SI-NEXT: s_add_u32 s4, s4, 3 +; SI-NEXT: s_addc_u32 s5, s5, 0 ; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s7, s7, 0 +; SI-NEXT: s_add_u32 s8, s8, 3 ; SI-NEXT: s_addc_u32 s9, s9, 0 -; SI-NEXT: s_lshr_b32 s72, s6, 16 -; SI-NEXT: s_lshr_b32 s73, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s73 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s72 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s63 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s62 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s24 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_lshr_b32 s76, s5, 16 +; SI-NEXT: s_lshr_b32 s77, s7, 16 +; SI-NEXT: s_lshr_b32 s78, s9, 16 +; SI-NEXT: s_lshr_b32 s79, s11, 16 +; SI-NEXT: s_lshr_b32 s88, s13, 16 +; SI-NEXT: s_lshr_b32 s89, s15, 16 +; SI-NEXT: s_lshr_b32 s90, s17, 16 +; SI-NEXT: s_lshr_b32 s91, s19, 16 +; SI-NEXT: s_lshr_b32 s92, s21, 16 +; SI-NEXT: s_lshr_b32 s93, s23, 16 +; SI-NEXT: s_lshr_b32 s94, s25, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[24:25], 16 ; SI-NEXT: .LBB45_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 -; SI-NEXT: v_or_b32_e32 v2, v48, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v48 -; SI-NEXT: v_or_b32_e32 v5, v5, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v38 -; SI-NEXT: v_or_b32_e32 v7, v7, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 -; SI-NEXT: v_or_b32_e32 v9, v34, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v34 -; SI-NEXT: v_or_b32_e32 v11, v32, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 -; SI-NEXT: v_or_b32_e32 v13, v30, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v30 -; SI-NEXT: v_or_b32_e32 v15, v28, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 -; SI-NEXT: v_or_b32_e32 v17, v26, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v19, v24, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v24 -; SI-NEXT: v_or_b32_e32 v1, v52, v1 -; SI-NEXT: v_or_b32_e32 v3, v50, v3 -; SI-NEXT: v_or_b32_e32 v4, v39, v4 -; SI-NEXT: v_or_b32_e32 v6, v37, v6 -; SI-NEXT: v_or_b32_e32 v8, v35, v8 -; SI-NEXT: v_or_b32_e32 v10, v33, v10 -; SI-NEXT: v_or_b32_e32 v12, v31, v12 -; SI-NEXT: v_or_b32_e32 v14, v29, v14 -; SI-NEXT: v_or_b32_e32 v16, v27, v16 -; SI-NEXT: v_or_b32_e32 v18, v25, v18 -; SI-NEXT: v_or_b32_e32 v20, v23, v20 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB45_4: -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: s_lshl_b32 s27, s72, 16 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_or_b32 s24, s24, s27 +; SI-NEXT: s_and_b32 s25, s25, 0xffff +; SI-NEXT: s_lshl_b32 s27, s94, 16 +; SI-NEXT: s_or_b32 s25, s25, s27 +; SI-NEXT: s_lshl_b32 s27, s62, 16 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_or_b32 s22, s22, s27 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s27, s93, 16 +; SI-NEXT: s_or_b32 s23, s23, s27 +; SI-NEXT: s_lshl_b32 s27, s60, 16 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_or_b32 s20, s20, s27 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s27, s92, 16 +; SI-NEXT: s_or_b32 s21, s21, s27 +; SI-NEXT: s_lshl_b32 s27, s58, 16 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_or_b32 s18, s18, s27 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s27, s91, 16 +; SI-NEXT: s_or_b32 s19, s19, s27 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s27, s56, 16 +; SI-NEXT: s_or_b32 s16, s16, s27 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s27, s90, 16 +; SI-NEXT: s_or_b32 s17, s17, s27 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s27, s46, 16 +; SI-NEXT: s_or_b32 s14, s14, s27 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s27, s89, 16 +; SI-NEXT: s_or_b32 s15, s15, s27 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s27, s44, 16 +; SI-NEXT: s_or_b32 s12, s12, s27 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s27, s88, 16 +; SI-NEXT: s_or_b32 s13, s13, s27 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s27, s42, 16 +; SI-NEXT: s_or_b32 s10, s10, s27 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s27, s79, 16 +; SI-NEXT: s_or_b32 s11, s11, s27 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s27, s40, 16 +; SI-NEXT: s_or_b32 s8, s8, s27 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s27, s78, 16 +; SI-NEXT: s_or_b32 s9, s9, s27 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s27, s28, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 s6, s6, s27 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s27, s77, 16 +; SI-NEXT: s_or_b32 s4, s4, s26 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s26, s76, 16 +; SI-NEXT: s_or_b32 s7, s7, s27 +; SI-NEXT: s_or_b32 s5, s5, s26 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_mov_b32_e32 v1, s25 +; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: v_mov_b32_e32 v3, s23 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s18 +; SI-NEXT: v_mov_b32_e32 v7, s19 +; SI-NEXT: v_mov_b32_e32 v8, s16 +; SI-NEXT: v_mov_b32_e32 v9, s17 +; SI-NEXT: v_mov_b32_e32 v10, s14 +; SI-NEXT: v_mov_b32_e32 v11, s15 +; SI-NEXT: v_mov_b32_e32 v12, s12 +; SI-NEXT: v_mov_b32_e32 v13, s13 +; SI-NEXT: v_mov_b32_e32 v14, s10 +; SI-NEXT: v_mov_b32_e32 v15, s11 +; SI-NEXT: v_mov_b32_e32 v16, s8 +; SI-NEXT: v_mov_b32_e32 v17, s9 +; SI-NEXT: v_mov_b32_e32 v18, s6 +; SI-NEXT: v_mov_b32_e32 v19, s7 +; SI-NEXT: v_mov_b32_e32 v20, s4 +; SI-NEXT: v_mov_b32_e32 v21, s5 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB45_4: +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr76 ; SI-NEXT: s_branch .LBB45_2 ; ; VI-LABEL: bitcast_v11i64_to_v44f16_scalar: @@ -23735,57 +22301,6 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v44f16_to_v11i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -23802,142 +22317,137 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_mov_b32_e32 v45, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_mov_b32_e32 v34, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_mov_b32_e32 v35, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_mov_b32_e32 v36, v17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_mov_b32_e32 v37, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_mov_b32_e32 v38, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_mov_b32_e32 v39, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_mov_b32_e32 v48, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v11 +; SI-NEXT: v_mov_b32_e32 v51, v10 +; SI-NEXT: v_mov_b32_e32 v52, v9 +; SI-NEXT: v_mov_b32_e32 v53, v8 +; SI-NEXT: v_mov_b32_e32 v54, v7 +; SI-NEXT: v_mov_b32_e32 v55, v6 +; SI-NEXT: v_mov_b32_e32 v40, v5 +; SI-NEXT: v_mov_b32_e32 v41, v4 +; SI-NEXT: v_mov_b32_e32 v42, v3 +; SI-NEXT: v_mov_b32_e32 v43, v2 +; SI-NEXT: v_mov_b32_e32 v44, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v45 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v62 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v60 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v59 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v58 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v53 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v53 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_or_b32_e32 v2, v38, v2 -; SI-NEXT: v_or_b32_e32 v3, v36, v3 -; SI-NEXT: v_or_b32_e32 v4, v34, v4 -; SI-NEXT: v_or_b32_e32 v5, v32, v5 -; SI-NEXT: v_or_b32_e32 v6, v62, v6 -; SI-NEXT: v_or_b32_e32 v7, v60, v7 -; SI-NEXT: v_or_b32_e32 v8, v58, v8 -; SI-NEXT: v_or_b32_e32 v9, v56, v9 -; SI-NEXT: v_or_b32_e32 v10, v46, v10 -; SI-NEXT: v_or_b32_e32 v11, v44, v11 -; SI-NEXT: v_or_b32_e32 v12, v42, v12 -; SI-NEXT: v_or_b32_e32 v13, v40, v13 -; SI-NEXT: v_or_b32_e32 v14, v54, v14 -; SI-NEXT: v_or_b32_e32 v15, v52, v15 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v47 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v46 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v49 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr60 @@ -23947,45 +22457,58 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v38 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v37 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v36 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v34 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; kill: killed $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr22 @@ -24015,10 +22538,10 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB46_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -24031,10 +22554,10 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -24043,168 +22566,170 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v55 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v60 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v56 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v49 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v43 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v53 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v52 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v47 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v45 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v46 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v34 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v41 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v39 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v36 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v18, v20, v18 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 @@ -24823,196 +23348,126 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; SI-LABEL: bitcast_v44f16_to_v11i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s40, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s40 -; SI-NEXT: s_lshr_b32 s40, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: s_lshr_b32 s15, s18, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v61, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v19 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v62, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s19 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v18 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s21 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s23 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s25 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s27 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: v_mov_b32_e32 v32, v7 +; SI-NEXT: v_mov_b32_e32 v33, v6 +; SI-NEXT: v_mov_b32_e32 v34, v5 +; SI-NEXT: v_mov_b32_e32 v35, v4 +; SI-NEXT: v_mov_b32_e32 v36, v3 +; SI-NEXT: v_mov_b32_e32 v37, v2 +; SI-NEXT: v_mov_b32_e32 v38, v1 +; SI-NEXT: v_mov_b32_e32 v39, v0 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v38 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v39 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 -; SI-NEXT: v_or_b32_e32 v2, v21, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 -; SI-NEXT: v_or_b32_e32 v3, v58, v3 -; SI-NEXT: v_or_b32_e32 v4, v56, v4 -; SI-NEXT: v_or_b32_e32 v5, v50, v5 -; SI-NEXT: v_or_b32_e32 v6, v49, v6 -; SI-NEXT: v_or_b32_e32 v7, v38, v7 -; SI-NEXT: v_or_b32_e32 v8, v37, v8 -; SI-NEXT: v_or_b32_e32 v9, v34, v9 -; SI-NEXT: v_or_b32_e32 v10, v33, v10 -; SI-NEXT: v_or_b32_e32 v11, v45, v11 -; SI-NEXT: v_or_b32_e32 v12, v44, v12 -; SI-NEXT: v_or_b32_e32 v13, v42, v13 -; SI-NEXT: v_or_b32_e32 v14, v40, v14 -; SI-NEXT: v_or_b32_e32 v15, v54, v15 -; SI-NEXT: v_or_b32_e32 v16, v52, v16 -; SI-NEXT: v_or_b32_e32 v17, v30, v17 -; SI-NEXT: v_or_b32_e32 v18, v28, v18 -; SI-NEXT: v_or_b32_e32 v19, v26, v19 -; SI-NEXT: v_or_b32_e32 v20, v24, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -25025,167 +23480,160 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v4, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v56 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v33 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v44 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v39 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v38 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v37 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v52 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v52 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v51 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v28 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v26 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v48 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v32 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v60 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v47 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v48 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v39 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v45 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v43 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v54 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v53 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v36 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v30 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v50 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v33 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 @@ -25193,111 +23641,9 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 ; SI-NEXT: .LBB47_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v51, v33 -; SI-NEXT: v_mov_b32_e32 v50, v32 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v48, v47 -; SI-NEXT: v_mov_b32_e32 v47, v53 -; SI-NEXT: v_mov_b32_e32 v53, v22 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v39, v56 -; SI-NEXT: v_mov_b32_e32 v56, v54 -; SI-NEXT: v_mov_b32_e32 v54, v24 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v38, v57 -; SI-NEXT: v_mov_b32_e32 v57, v55 -; SI-NEXT: v_mov_b32_e32 v55, v25 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v37, v58 -; SI-NEXT: v_mov_b32_e32 v58, v40 -; SI-NEXT: v_mov_b32_e32 v40, v26 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v36, v59 -; SI-NEXT: v_mov_b32_e32 v59, v41 -; SI-NEXT: v_mov_b32_e32 v41, v27 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v35, v60 -; SI-NEXT: v_mov_b32_e32 v60, v42 -; SI-NEXT: v_mov_b32_e32 v42, v28 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v34, v61 -; SI-NEXT: v_mov_b32_e32 v61, v43 -; SI-NEXT: v_mov_b32_e32 v43, v29 -; SI-NEXT: v_mov_b32_e32 v33, v62 -; SI-NEXT: v_mov_b32_e32 v62, v44 -; SI-NEXT: v_mov_b32_e32 v44, v30 -; SI-NEXT: v_mov_b32_e32 v32, v63 -; SI-NEXT: v_mov_b32_e32 v63, v45 -; SI-NEXT: v_mov_b32_e32 v45, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v31, v45 -; SI-NEXT: v_mov_b32_e32 v45, v63 -; SI-NEXT: v_mov_b32_e32 v63, v32 -; SI-NEXT: v_mov_b32_e32 v30, v44 -; SI-NEXT: v_mov_b32_e32 v44, v62 -; SI-NEXT: v_mov_b32_e32 v62, v33 -; SI-NEXT: v_mov_b32_e32 v29, v43 -; SI-NEXT: v_mov_b32_e32 v43, v61 -; SI-NEXT: v_mov_b32_e32 v61, v34 -; SI-NEXT: v_mov_b32_e32 v28, v42 -; SI-NEXT: v_mov_b32_e32 v42, v60 -; SI-NEXT: v_mov_b32_e32 v60, v35 -; SI-NEXT: v_mov_b32_e32 v27, v41 -; SI-NEXT: v_mov_b32_e32 v41, v59 -; SI-NEXT: v_mov_b32_e32 v59, v36 -; SI-NEXT: v_mov_b32_e32 v26, v40 -; SI-NEXT: v_mov_b32_e32 v40, v58 -; SI-NEXT: v_mov_b32_e32 v58, v37 -; SI-NEXT: v_mov_b32_e32 v25, v55 -; SI-NEXT: v_mov_b32_e32 v55, v57 -; SI-NEXT: v_mov_b32_e32 v57, v38 -; SI-NEXT: v_mov_b32_e32 v24, v54 -; SI-NEXT: v_mov_b32_e32 v54, v56 -; SI-NEXT: v_mov_b32_e32 v56, v39 -; SI-NEXT: v_mov_b32_e32 v22, v53 -; SI-NEXT: v_mov_b32_e32 v53, v47 -; SI-NEXT: v_mov_b32_e32 v47, v48 -; SI-NEXT: v_mov_b32_e32 v32, v50 -; SI-NEXT: v_mov_b32_e32 v33, v51 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB47_2 ; ; VI-LABEL: bitcast_v44f16_to_v11i64_scalar: @@ -28984,154 +27330,54 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB52_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v61, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v63, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v22 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v29, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v22, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v23, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v24, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v25, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v26, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v27, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v28, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v29, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v32, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v34, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v37, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 ; SI-NEXT: .LBB52_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_4 @@ -29145,188 +27391,98 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_alignbit_b32 v22, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v23, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v24, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v25, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v26, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v27, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v28, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v29, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v32, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v34, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v37, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 ; SI-NEXT: .LBB52_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v62 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v58 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v56 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v47 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v44 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v43 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v55 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v40 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v51 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v52 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v39 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v48 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v34 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v36 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v30 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v33 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v25 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v29 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v27 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v37 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v51 +; SI-NEXT: v_or_b32_e32 v2, v2, v34 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v50 +; SI-NEXT: v_or_b32_e32 v4, v4, v32 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v49 +; SI-NEXT: v_or_b32_e32 v6, v6, v29 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_or_b32_e32 v8, v8, v28 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 +; SI-NEXT: v_or_b32_e32 v10, v10, v27 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v38 +; SI-NEXT: v_or_b32_e32 v12, v12, v26 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v36 +; SI-NEXT: v_or_b32_e32 v14, v14, v25 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v35 +; SI-NEXT: v_or_b32_e32 v16, v16, v24 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v33 +; SI-NEXT: v_or_b32_e32 v18, v18, v23 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v31 +; SI-NEXT: v_or_b32_e32 v20, v20, v22 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v37 +; SI-NEXT: v_or_b32_e32 v3, v3, v34 +; SI-NEXT: v_or_b32_e32 v5, v5, v32 +; SI-NEXT: v_or_b32_e32 v7, v7, v29 +; SI-NEXT: v_or_b32_e32 v9, v9, v28 +; SI-NEXT: v_or_b32_e32 v11, v11, v27 +; SI-NEXT: v_or_b32_e32 v13, v13, v26 +; SI-NEXT: v_or_b32_e32 v15, v15, v25 +; SI-NEXT: v_or_b32_e32 v17, v17, v24 +; SI-NEXT: v_or_b32_e32 v19, v19, v23 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v11f64_to_v44f16: @@ -29745,354 +27901,185 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: v_mov_b32_e32 v20, s16 -; SI-NEXT: v_mov_b32_e32 v21, s17 -; SI-NEXT: v_mov_b32_e32 v18, s18 -; SI-NEXT: v_mov_b32_e32 v19, s19 -; SI-NEXT: v_mov_b32_e32 v16, s20 -; SI-NEXT: v_mov_b32_e32 v17, s21 -; SI-NEXT: v_mov_b32_e32 v14, s22 -; SI-NEXT: v_mov_b32_e32 v15, s23 -; SI-NEXT: v_mov_b32_e32 v10, s24 -; SI-NEXT: v_mov_b32_e32 v11, s25 -; SI-NEXT: v_mov_b32_e32 v12, s26 -; SI-NEXT: v_mov_b32_e32 v13, s27 +; SI-NEXT: v_mov_b32_e32 v22, s16 +; SI-NEXT: v_mov_b32_e32 v23, s17 +; SI-NEXT: v_mov_b32_e32 v20, s18 +; SI-NEXT: v_mov_b32_e32 v21, s19 +; SI-NEXT: v_mov_b32_e32 v18, s20 +; SI-NEXT: v_mov_b32_e32 v19, s21 +; SI-NEXT: v_mov_b32_e32 v16, s22 +; SI-NEXT: v_mov_b32_e32 v17, s23 +; SI-NEXT: v_mov_b32_e32 v9, s24 +; SI-NEXT: v_mov_b32_e32 v10, s25 +; SI-NEXT: v_mov_b32_e32 v14, s26 +; SI-NEXT: v_mov_b32_e32 v15, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v8, s28 -; SI-NEXT: v_mov_b32_e32 v9, s29 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v18 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v57, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v61, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v3 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v0 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v48, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v18 +; SI-NEXT: v_lshr_b64 v[30:31], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[31:32], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[33:34], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[9:10], 16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v20 -; SI-NEXT: s_cbranch_execnz .LBB53_3 -; SI-NEXT: .LBB53_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v23 +; SI-NEXT: v_lshr_b64 v[38:39], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[22:23], 16 +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; %cmp.true ; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v16 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v62, v19 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_lshr_b64 v[30:31], v[6:7], 16 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_lshr_b64 v[31:32], v[4:5], 16 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_lshr_b64 v[32:33], v[2:3], 16 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_lshr_b64 v[33:34], v[0:1], 16 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_lshr_b64 v[34:35], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[22:23], 16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v63, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v23 ; SI-NEXT: .LBB53_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v22 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v62 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v47 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v60 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v24, v22, v24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v40 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v58 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v44 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v43 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v55 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v40 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v25, v22, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v26 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v26, v20, v22 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v55 +; SI-NEXT: v_or_b32_e32 v27, v20, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v28, v18, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v54 +; SI-NEXT: v_or_b32_e32 v29, v18, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v36 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v22, v16, v18 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v17 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v51 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v52 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v39 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v48 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v34 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v36 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v30 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v33 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v26 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v29 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v27 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v23, v16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v35 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v53 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 +; SI-NEXT: v_or_b32_e32 v10, v10, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v14, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v52 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 +; SI-NEXT: v_or_b32_e32 v14, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v24 +; SI-NEXT: v_mov_b32_e32 v1, v25 +; SI-NEXT: v_mov_b32_e32 v2, v26 +; SI-NEXT: v_mov_b32_e32 v3, v27 +; SI-NEXT: v_mov_b32_e32 v4, v28 +; SI-NEXT: v_mov_b32_e32 v5, v29 +; SI-NEXT: v_mov_b32_e32 v6, v22 +; SI-NEXT: v_mov_b32_e32 v7, v23 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; kill: killed $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: s_branch .LBB53_2 ; ; VI-LABEL: bitcast_v11f64_to_v44f16_scalar: @@ -30710,65 +28697,14 @@ cmp.false: br label %end end: - %phi = phi <44 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <44 x half> %phi -} - -define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { -; SI-LABEL: bitcast_v44f16_to_v11f64: -; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 + %phi = phi <44 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <44 x half> %phi +} + +define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { +; SI-LABEL: bitcast_v44f16_to_v11f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -30785,142 +28721,137 @@ define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_mov_b32_e32 v45, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_mov_b32_e32 v34, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_mov_b32_e32 v35, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_mov_b32_e32 v36, v17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_mov_b32_e32 v37, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_mov_b32_e32 v38, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_mov_b32_e32 v39, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_mov_b32_e32 v48, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v11 +; SI-NEXT: v_mov_b32_e32 v51, v10 +; SI-NEXT: v_mov_b32_e32 v52, v9 +; SI-NEXT: v_mov_b32_e32 v53, v8 +; SI-NEXT: v_mov_b32_e32 v54, v7 +; SI-NEXT: v_mov_b32_e32 v55, v6 +; SI-NEXT: v_mov_b32_e32 v40, v5 +; SI-NEXT: v_mov_b32_e32 v41, v4 +; SI-NEXT: v_mov_b32_e32 v42, v3 +; SI-NEXT: v_mov_b32_e32 v43, v2 +; SI-NEXT: v_mov_b32_e32 v44, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v45 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v62 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v60 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v59 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v58 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v53 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v53 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_or_b32_e32 v2, v38, v2 -; SI-NEXT: v_or_b32_e32 v3, v36, v3 -; SI-NEXT: v_or_b32_e32 v4, v34, v4 -; SI-NEXT: v_or_b32_e32 v5, v32, v5 -; SI-NEXT: v_or_b32_e32 v6, v62, v6 -; SI-NEXT: v_or_b32_e32 v7, v60, v7 -; SI-NEXT: v_or_b32_e32 v8, v58, v8 -; SI-NEXT: v_or_b32_e32 v9, v56, v9 -; SI-NEXT: v_or_b32_e32 v10, v46, v10 -; SI-NEXT: v_or_b32_e32 v11, v44, v11 -; SI-NEXT: v_or_b32_e32 v12, v42, v12 -; SI-NEXT: v_or_b32_e32 v13, v40, v13 -; SI-NEXT: v_or_b32_e32 v14, v54, v14 -; SI-NEXT: v_or_b32_e32 v15, v52, v15 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v47 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v46 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v49 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr60 @@ -30930,45 +28861,58 @@ define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v38 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v37 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v36 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v34 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; kill: killed $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr22 @@ -30998,10 +28942,10 @@ define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB54_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -31014,10 +28958,10 @@ define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -31026,168 +28970,170 @@ define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v55 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v60 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v56 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v49 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v43 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v53 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v52 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v47 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v45 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v46 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v34 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v41 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v39 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v36 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v18, v20, v18 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 @@ -31806,369 +29752,292 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; SI-LABEL: bitcast_v44f16_to_v11f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s40, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s40 -; SI-NEXT: s_lshr_b32 s40, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: s_lshr_b32 s15, s18, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v61, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v19 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v62, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s19 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v18 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s21 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s23 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s25 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s27 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: v_mov_b32_e32 v32, v7 +; SI-NEXT: v_mov_b32_e32 v33, v6 +; SI-NEXT: v_mov_b32_e32 v34, v5 +; SI-NEXT: v_mov_b32_e32 v35, v4 +; SI-NEXT: v_mov_b32_e32 v36, v3 +; SI-NEXT: v_mov_b32_e32 v37, v2 +; SI-NEXT: v_mov_b32_e32 v38, v1 +; SI-NEXT: v_mov_b32_e32 v39, v0 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v38 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v39 ; SI-NEXT: s_cbranch_scc0 .LBB55_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 -; SI-NEXT: v_or_b32_e32 v2, v21, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 -; SI-NEXT: v_or_b32_e32 v3, v58, v3 -; SI-NEXT: v_or_b32_e32 v4, v56, v4 -; SI-NEXT: v_or_b32_e32 v5, v50, v5 -; SI-NEXT: v_or_b32_e32 v6, v49, v6 -; SI-NEXT: v_or_b32_e32 v7, v38, v7 -; SI-NEXT: v_or_b32_e32 v8, v37, v8 -; SI-NEXT: v_or_b32_e32 v9, v34, v9 -; SI-NEXT: v_or_b32_e32 v10, v33, v10 -; SI-NEXT: v_or_b32_e32 v11, v45, v11 -; SI-NEXT: v_or_b32_e32 v12, v44, v12 -; SI-NEXT: v_or_b32_e32 v13, v42, v13 -; SI-NEXT: v_or_b32_e32 v14, v40, v14 -; SI-NEXT: v_or_b32_e32 v15, v54, v15 -; SI-NEXT: v_or_b32_e32 v16, v52, v16 -; SI-NEXT: v_or_b32_e32 v17, v30, v17 -; SI-NEXT: v_or_b32_e32 v18, v28, v18 -; SI-NEXT: v_or_b32_e32 v19, v26, v19 -; SI-NEXT: v_or_b32_e32 v20, v24, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB55_3 ; SI-NEXT: .LBB55_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v4, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v56 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v49 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v34 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v33 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v46 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v44 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v42 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v54 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v52 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v29 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v28 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v26 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v39 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v38 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v37 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v52 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v51 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v48 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v32 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v43 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v54 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v53 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v36 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v30 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v50 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v33 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 @@ -32176,111 +30045,9 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 ; SI-NEXT: .LBB55_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB55_4: -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v51, v33 -; SI-NEXT: v_mov_b32_e32 v50, v32 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v48, v47 -; SI-NEXT: v_mov_b32_e32 v47, v53 -; SI-NEXT: v_mov_b32_e32 v53, v22 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v39, v56 -; SI-NEXT: v_mov_b32_e32 v56, v54 -; SI-NEXT: v_mov_b32_e32 v54, v24 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v38, v57 -; SI-NEXT: v_mov_b32_e32 v57, v55 -; SI-NEXT: v_mov_b32_e32 v55, v25 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v37, v58 -; SI-NEXT: v_mov_b32_e32 v58, v40 -; SI-NEXT: v_mov_b32_e32 v40, v26 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v36, v59 -; SI-NEXT: v_mov_b32_e32 v59, v41 -; SI-NEXT: v_mov_b32_e32 v41, v27 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v35, v60 -; SI-NEXT: v_mov_b32_e32 v60, v42 -; SI-NEXT: v_mov_b32_e32 v42, v28 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v34, v61 -; SI-NEXT: v_mov_b32_e32 v61, v43 -; SI-NEXT: v_mov_b32_e32 v43, v29 -; SI-NEXT: v_mov_b32_e32 v33, v62 -; SI-NEXT: v_mov_b32_e32 v62, v44 -; SI-NEXT: v_mov_b32_e32 v44, v30 -; SI-NEXT: v_mov_b32_e32 v32, v63 -; SI-NEXT: v_mov_b32_e32 v63, v45 -; SI-NEXT: v_mov_b32_e32 v45, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v31, v45 -; SI-NEXT: v_mov_b32_e32 v45, v63 -; SI-NEXT: v_mov_b32_e32 v63, v32 -; SI-NEXT: v_mov_b32_e32 v30, v44 -; SI-NEXT: v_mov_b32_e32 v44, v62 -; SI-NEXT: v_mov_b32_e32 v62, v33 -; SI-NEXT: v_mov_b32_e32 v29, v43 -; SI-NEXT: v_mov_b32_e32 v43, v61 -; SI-NEXT: v_mov_b32_e32 v61, v34 -; SI-NEXT: v_mov_b32_e32 v28, v42 -; SI-NEXT: v_mov_b32_e32 v42, v60 -; SI-NEXT: v_mov_b32_e32 v60, v35 -; SI-NEXT: v_mov_b32_e32 v27, v41 -; SI-NEXT: v_mov_b32_e32 v41, v59 -; SI-NEXT: v_mov_b32_e32 v59, v36 -; SI-NEXT: v_mov_b32_e32 v26, v40 -; SI-NEXT: v_mov_b32_e32 v40, v58 -; SI-NEXT: v_mov_b32_e32 v58, v37 -; SI-NEXT: v_mov_b32_e32 v25, v55 -; SI-NEXT: v_mov_b32_e32 v55, v57 -; SI-NEXT: v_mov_b32_e32 v57, v38 -; SI-NEXT: v_mov_b32_e32 v24, v54 -; SI-NEXT: v_mov_b32_e32 v54, v56 -; SI-NEXT: v_mov_b32_e32 v56, v39 -; SI-NEXT: v_mov_b32_e32 v22, v53 -; SI-NEXT: v_mov_b32_e32 v53, v47 -; SI-NEXT: v_mov_b32_e32 v47, v48 -; SI-NEXT: v_mov_b32_e32 v32, v50 -; SI-NEXT: v_mov_b32_e32 v33, v51 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB55_2 ; ; VI-LABEL: bitcast_v44f16_to_v11f64_scalar: @@ -32827,7 +30594,14 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v44i16_to_v44f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v3 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v51 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v48 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; kill: killed $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr22 @@ -32867,34 +30641,26 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; kill: killed $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v0 ; SI-NEXT: ; kill: killed $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -32913,441 +30679,403 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v50 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v35 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v28 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v53 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: .LBB56_2: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB56_4 -; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v51 +; SI-NEXT: v_or_b32_e32 v42, v1, v22 +; SI-NEXT: v_alignbit_b32 v1, v42, v46, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 +; SI-NEXT: v_or_b32_e32 v40, v1, v3 +; SI-NEXT: v_alignbit_b32 v1, v40, v47, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v54, v1, v62 +; SI-NEXT: v_alignbit_b32 v1, v54, v56, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v25 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v52, v1, v32 +; SI-NEXT: v_alignbit_b32 v1, v52, v59, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v0, v0, v46 +; SI-NEXT: v_or_b32_e32 v49, v1, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_alignbit_b32 v1, v49, v61, 16 +; SI-NEXT: v_or_b32_e32 v0, v0, v47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v38, v1, v53 +; SI-NEXT: v_or_b32_e32 v0, v0, v56 +; SI-NEXT: v_alignbit_b32 v1, v38, v43, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v0, v0, v59 +; SI-NEXT: v_or_b32_e32 v35, v1, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_alignbit_b32 v1, v35, v36, 16 +; SI-NEXT: v_or_b32_e32 v0, v0, v61 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v33, v1, v45 +; SI-NEXT: v_or_b32_e32 v0, v0, v43 +; SI-NEXT: v_alignbit_b32 v1, v33, v50, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 -; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v0, v0, v36 +; SI-NEXT: v_or_b32_e32 v30, v1, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 -; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: v_alignbit_b32 v1, v30, v55, 16 +; SI-NEXT: v_or_b32_e32 v0, v0, v50 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 -; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v28, v1, v57 +; SI-NEXT: v_or_b32_e32 v0, v0, v55 +; SI-NEXT: v_alignbit_b32 v1, v28, v58, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 -; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v0, v0, v58 +; SI-NEXT: v_or_b32_e32 v22, v1, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_alignbit_b32 v1, v22, v44, 16 +; SI-NEXT: v_or_b32_e32 v0, v0, v44 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: .LBB56_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v44, v20 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v20, v60, v20 +; SI-NEXT: v_or_b32_e32 v18, v58, v18 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v18, v57, v18 +; SI-NEXT: v_or_b32_e32 v16, v55, v16 +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v16, v63, v16 +; SI-NEXT: v_or_b32_e32 v14, v50, v14 +; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v14, v45, v14 +; SI-NEXT: v_or_b32_e32 v12, v36, v12 +; SI-NEXT: v_add_i32_e32 v33, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v12, v41, v12 +; SI-NEXT: v_or_b32_e32 v10, v43, v10 +; SI-NEXT: v_add_i32_e32 v35, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v10, v53, v10 +; SI-NEXT: v_or_b32_e32 v8, v61, v8 +; SI-NEXT: v_add_i32_e32 v38, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v8, v39, v8 +; SI-NEXT: v_or_b32_e32 v6, v59, v6 +; SI-NEXT: v_add_i32_e32 v49, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v6, v32, v6 +; SI-NEXT: v_or_b32_e32 v4, v56, v4 +; SI-NEXT: v_add_i32_e32 v52, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v4, v62, v4 +; SI-NEXT: v_or_b32_e32 v2, v47, v2 +; SI-NEXT: v_add_i32_e32 v54, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v37 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v40, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v40 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v42, vcc, s6, v0 +; SI-NEXT: v_alignbit_b32 v0, v42, v2, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v40, v4, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v54, v6, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v52, v8, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v49, v10, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v38, v12, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v35, v14, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v33, v16, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v30, v18, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v56, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v39 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v28, v20, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v22, v23, 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v42 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: .LBB56_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v57 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v24 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v10, v41 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v45 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v56 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v61 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v55 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v43 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -33364,11 +31092,60 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v44i16_to_v44f16: @@ -33799,328 +31576,451 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i ; SI-LABEL: bitcast_v44i16_to_v44f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s43, s29, 16 -; SI-NEXT: s_lshr_b32 s42, s28, 16 -; SI-NEXT: s_lshr_b32 s41, s27, 16 -; SI-NEXT: s_lshr_b32 s40, s26, 16 -; SI-NEXT: s_lshr_b32 s15, s25, 16 -; SI-NEXT: s_lshr_b32 s14, s24, 16 -; SI-NEXT: s_lshr_b32 s13, s23, 16 -; SI-NEXT: s_lshr_b32 s12, s22, 16 -; SI-NEXT: s_lshr_b32 s11, s21, 16 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v22, s30, 0 +; SI-NEXT: v_writelane_b32 v22, s31, 1 +; SI-NEXT: v_writelane_b32 v22, s34, 2 +; SI-NEXT: v_writelane_b32 v22, s35, 3 +; SI-NEXT: v_writelane_b32 v22, s36, 4 +; SI-NEXT: v_writelane_b32 v22, s37, 5 +; SI-NEXT: v_writelane_b32 v22, s38, 6 +; SI-NEXT: v_writelane_b32 v22, s39, 7 +; SI-NEXT: v_writelane_b32 v22, s48, 8 +; SI-NEXT: v_writelane_b32 v22, s49, 9 +; SI-NEXT: v_writelane_b32 v22, s50, 10 +; SI-NEXT: v_writelane_b32 v22, s51, 11 +; SI-NEXT: v_writelane_b32 v22, s52, 12 +; SI-NEXT: v_writelane_b32 v22, s53, 13 +; SI-NEXT: v_writelane_b32 v22, s54, 14 +; SI-NEXT: v_writelane_b32 v22, s55, 15 +; SI-NEXT: v_writelane_b32 v22, s64, 16 +; SI-NEXT: v_writelane_b32 v22, s65, 17 +; SI-NEXT: v_writelane_b32 v22, s66, 18 +; SI-NEXT: v_writelane_b32 v22, s67, 19 +; SI-NEXT: v_writelane_b32 v22, s68, 20 +; SI-NEXT: v_writelane_b32 v22, s69, 21 +; SI-NEXT: v_writelane_b32 v22, s70, 22 +; SI-NEXT: v_writelane_b32 v22, s71, 23 +; SI-NEXT: v_writelane_b32 v22, s80, 24 +; SI-NEXT: v_writelane_b32 v22, s81, 25 +; SI-NEXT: v_writelane_b32 v22, s82, 26 +; SI-NEXT: v_writelane_b32 v22, s83, 27 +; SI-NEXT: v_writelane_b32 v22, s84, 28 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; SI-NEXT: s_lshr_b32 s38, s29, 16 +; SI-NEXT: s_lshr_b32 s65, s28, 16 +; SI-NEXT: s_lshr_b32 s37, s27, 16 +; SI-NEXT: s_lshr_b32 s64, s26, 16 +; SI-NEXT: s_lshr_b32 s36, s25, 16 +; SI-NEXT: s_lshr_b32 s55, s24, 16 +; SI-NEXT: s_lshr_b32 s35, s23, 16 +; SI-NEXT: s_lshr_b32 s54, s22, 16 +; SI-NEXT: s_lshr_b32 s34, s21, 16 +; SI-NEXT: s_lshr_b32 s53, s20, 16 +; SI-NEXT: s_lshr_b32 s31, s19, 16 +; SI-NEXT: s_lshr_b32 s52, s18, 16 +; SI-NEXT: s_lshr_b32 s30, s17, 16 +; SI-NEXT: s_lshr_b32 s51, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v1 +; SI-NEXT: v_writelane_b32 v22, s85, 29 +; SI-NEXT: v_readfirstlane_b32 s82, v7 +; SI-NEXT: v_readfirstlane_b32 s84, v6 +; SI-NEXT: v_readfirstlane_b32 s71, v5 +; SI-NEXT: v_readfirstlane_b32 s81, v4 +; SI-NEXT: v_readfirstlane_b32 s68, v3 +; SI-NEXT: v_readfirstlane_b32 s70, v2 +; SI-NEXT: v_readfirstlane_b32 s66, v1 +; SI-NEXT: v_readfirstlane_b32 s67, v0 +; SI-NEXT: v_readfirstlane_b32 s50, v9 +; SI-NEXT: v_readfirstlane_b32 s85, v10 +; SI-NEXT: v_readfirstlane_b32 s49, v11 +; SI-NEXT: v_readfirstlane_b32 s83, v12 +; SI-NEXT: v_readfirstlane_b32 s48, v13 +; SI-NEXT: v_readfirstlane_b32 s80, v14 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; SI-NEXT: v_readfirstlane_b32 s39, v15 +; SI-NEXT: v_readfirstlane_b32 s69, v16 ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v8, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v58 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s30, 16 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s58, s51, 16 +; SI-NEXT: s_or_b32 s59, s5, s7 +; SI-NEXT: s_and_b32 s5, s19, 0xffff +; SI-NEXT: s_lshl_b32 s7, s31, 16 +; SI-NEXT: s_or_b32 s12, s4, s58 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s60, s52, 16 +; SI-NEXT: s_or_b32 s61, s5, s7 +; SI-NEXT: s_and_b32 s5, s21, 0xffff +; SI-NEXT: s_lshl_b32 s7, s34, 16 +; SI-NEXT: s_or_b32 s10, s4, s60 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s62, s53, 16 +; SI-NEXT: s_or_b32 s63, s5, s7 +; SI-NEXT: s_and_b32 s5, s23, 0xffff +; SI-NEXT: s_lshl_b32 s7, s35, 16 +; SI-NEXT: s_or_b32 s8, s4, s62 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s72, s54, 16 +; SI-NEXT: s_or_b32 s73, s5, s7 +; SI-NEXT: s_and_b32 s5, s25, 0xffff +; SI-NEXT: s_lshl_b32 s7, s36, 16 +; SI-NEXT: s_or_b32 s6, s4, s72 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s56, s55, 16 +; SI-NEXT: s_or_b32 s57, s5, s7 +; SI-NEXT: s_and_b32 s5, s27, 0xffff +; SI-NEXT: s_lshl_b32 s7, s37, 16 +; SI-NEXT: s_or_b32 s4, s4, s56 +; SI-NEXT: s_lshl_b32 s46, s64, 16 +; SI-NEXT: s_or_b32 s47, s5, s7 +; SI-NEXT: s_and_b32 s5, s29, 0xffff +; SI-NEXT: s_lshl_b32 s7, s38, 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[56:57], 16 +; SI-NEXT: s_and_b32 s56, s26, 0xffff +; SI-NEXT: s_lshl_b32 s44, s65, 16 +; SI-NEXT: s_or_b32 s45, s5, s7 +; SI-NEXT: s_and_b32 s5, s66, 0xffff +; SI-NEXT: s_lshl_b32 s7, s39, 16 +; SI-NEXT: s_or_b32 s56, s56, s46 +; SI-NEXT: s_lshr_b64 s[76:77], s[46:47], 16 +; SI-NEXT: s_and_b32 s46, s28, 0xffff +; SI-NEXT: s_lshl_b32 s42, s69, 16 +; SI-NEXT: s_or_b32 s43, s5, s7 +; SI-NEXT: s_and_b32 s5, s68, 0xffff +; SI-NEXT: s_lshl_b32 s7, s48, 16 +; SI-NEXT: s_or_b32 s46, s46, s44 +; SI-NEXT: s_lshr_b64 s[78:79], s[44:45], 16 +; SI-NEXT: s_and_b32 s44, s67, 0xffff +; SI-NEXT: s_lshl_b32 s40, s80, 16 +; SI-NEXT: s_or_b32 s41, s5, s7 +; SI-NEXT: s_and_b32 s5, s71, 0xffff +; SI-NEXT: s_lshl_b32 s7, s49, 16 +; SI-NEXT: s_or_b32 s44, s44, s42 +; SI-NEXT: s_lshr_b64 s[88:89], s[42:43], 16 +; SI-NEXT: s_and_b32 s42, s70, 0xffff +; SI-NEXT: s_lshl_b32 s14, s83, 16 +; SI-NEXT: s_or_b32 s15, s5, s7 +; SI-NEXT: s_and_b32 s5, s82, 0xffff +; SI-NEXT: s_lshl_b32 s7, s50, 16 +; SI-NEXT: s_or_b32 s42, s42, s40 +; SI-NEXT: s_lshr_b64 s[90:91], s[40:41], 16 +; SI-NEXT: s_and_b32 s40, s81, 0xffff +; SI-NEXT: s_or_b32 s95, s5, s7 +; SI-NEXT: s_lshl_b32 s94, s85, 16 +; SI-NEXT: s_or_b32 s40, s40, s14 +; SI-NEXT: s_lshr_b64 s[92:93], s[14:15], 16 +; SI-NEXT: s_and_b32 s14, s84, 0xffff +; SI-NEXT: s_mov_b32 s13, s59 +; SI-NEXT: s_lshr_b64 s[58:59], s[58:59], 16 +; SI-NEXT: s_mov_b32 s11, s61 +; SI-NEXT: s_lshr_b64 s[60:61], s[60:61], 16 +; SI-NEXT: s_mov_b32 s9, s63 +; SI-NEXT: s_lshr_b64 s[62:63], s[62:63], 16 +; SI-NEXT: s_mov_b32 s7, s73 +; SI-NEXT: s_lshr_b64 s[72:73], s[72:73], 16 +; SI-NEXT: s_mov_b32 s5, s57 +; SI-NEXT: s_mov_b32 s57, s47 +; SI-NEXT: s_mov_b32 s47, s45 +; SI-NEXT: s_mov_b32 s45, s43 +; SI-NEXT: s_mov_b32 s43, s41 +; SI-NEXT: s_mov_b32 s41, s15 +; SI-NEXT: s_or_b32 s14, s14, s94 +; SI-NEXT: s_mov_b32 s15, s95 +; SI-NEXT: s_lshr_b64 s[94:95], s[94:95], 16 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v58 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v57 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v56 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v47 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v46 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v45 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v44 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v59 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: s_add_i32 s43, s43, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s84, s84, 3 +; SI-NEXT: s_and_b32 s4, s84, 0xffff +; SI-NEXT: s_lshl_b32 s5, s85, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s82, s82, 3 +; SI-NEXT: s_add_i32 s14, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s82, 0xffff +; SI-NEXT: s_lshl_b32 s5, s50, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s81, s81, 3 +; SI-NEXT: s_add_i32 s15, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s81, 0xffff +; SI-NEXT: s_lshl_b32 s5, s83, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s71, s71, 3 +; SI-NEXT: s_add_i32 s40, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s71, 0xffff +; SI-NEXT: s_lshl_b32 s5, s49, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s70, s70, 3 +; SI-NEXT: s_add_i32 s41, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s70, 0xffff +; SI-NEXT: s_lshl_b32 s5, s80, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s68, s68, 3 +; SI-NEXT: s_add_i32 s42, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s68, 0xffff +; SI-NEXT: s_lshl_b32 s5, s48, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s67, s67, 3 +; SI-NEXT: s_add_i32 s43, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s67, 0xffff +; SI-NEXT: s_lshl_b32 s5, s69, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s66, s66, 3 +; SI-NEXT: s_add_i32 s44, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s66, 0xffff +; SI-NEXT: s_lshl_b32 s5, s39, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s45, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s65, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s46, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s38, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s47, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s64, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s56, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s37, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s57, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s55, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s25, 0xffff +; SI-NEXT: s_lshl_b32 s6, s36, 16 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s22, 0xffff +; SI-NEXT: s_lshl_b32 s7, s54, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s23, 0xffff +; SI-NEXT: s_lshl_b32 s8, s35, 16 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s53, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s34, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s18, 0xffff +; SI-NEXT: s_lshl_b32 s11, s52, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s19, 0xffff +; SI-NEXT: s_lshl_b32 s12, s31, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s12, s16, 0xffff +; SI-NEXT: s_lshl_b32 s13, s51, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s30, 16 +; SI-NEXT: s_or_b32 s13, s16, s13 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_lshr_b64 s[58:59], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[56:57], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[14:15], 16 +; SI-NEXT: s_lshr_b32 s30, s13, 16 +; SI-NEXT: s_lshr_b32 s31, s11, 16 +; SI-NEXT: s_lshr_b32 s34, s9, 16 +; SI-NEXT: s_lshr_b32 s35, s7, 16 +; SI-NEXT: s_lshr_b32 s36, s5, 16 +; SI-NEXT: s_lshr_b32 s37, s57, 16 +; SI-NEXT: s_lshr_b32 s38, s47, 16 +; SI-NEXT: s_lshr_b32 s39, s45, 16 +; SI-NEXT: s_lshr_b32 s48, s43, 16 +; SI-NEXT: s_lshr_b32 s49, s41, 16 +; SI-NEXT: s_lshr_b32 s50, s15, 16 ; SI-NEXT: .LBB57_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v10 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v23 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v14 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v27 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v17 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v31 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v20 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v35 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v24 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v39 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v28 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v51 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v32 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v53 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v36 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v55 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v42 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v43 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v41 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v21, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v49 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v37 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s16, s58, 16 +; SI-NEXT: s_or_b32 s12, s12, s16 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s16, s30, 16 +; SI-NEXT: s_or_b32 s13, s13, s16 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s16, s60, 16 +; SI-NEXT: s_or_b32 s10, s10, s16 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s16, s31, 16 +; SI-NEXT: s_or_b32 s11, s11, s16 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s16, s62, 16 +; SI-NEXT: s_or_b32 s8, s8, s16 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s16, s34, 16 +; SI-NEXT: s_or_b32 s9, s9, s16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s16, s72, 16 +; SI-NEXT: s_or_b32 s6, s6, s16 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s16, s35, 16 +; SI-NEXT: s_or_b32 s7, s7, s16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s16, s74, 16 +; SI-NEXT: s_or_b32 s4, s4, s16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s16, s36, 16 +; SI-NEXT: s_or_b32 s5, s5, s16 +; SI-NEXT: s_and_b32 s16, s56, 0xffff +; SI-NEXT: s_lshl_b32 s17, s76, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s57, 0xffff +; SI-NEXT: s_lshl_b32 s18, s37, 16 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_and_b32 s18, s46, 0xffff +; SI-NEXT: s_lshl_b32 s19, s78, 16 +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: s_and_b32 s19, s47, 0xffff +; SI-NEXT: s_lshl_b32 s20, s38, 16 +; SI-NEXT: s_or_b32 s19, s19, s20 +; SI-NEXT: s_and_b32 s20, s44, 0xffff +; SI-NEXT: s_lshl_b32 s21, s88, 16 +; SI-NEXT: s_or_b32 s20, s20, s21 +; SI-NEXT: s_and_b32 s21, s45, 0xffff +; SI-NEXT: s_lshl_b32 s22, s39, 16 +; SI-NEXT: s_or_b32 s21, s21, s22 +; SI-NEXT: s_and_b32 s22, s42, 0xffff +; SI-NEXT: s_lshl_b32 s23, s90, 16 +; SI-NEXT: s_or_b32 s22, s22, s23 +; SI-NEXT: s_and_b32 s23, s43, 0xffff +; SI-NEXT: s_lshl_b32 s24, s48, 16 +; SI-NEXT: s_or_b32 s23, s23, s24 +; SI-NEXT: s_and_b32 s24, s40, 0xffff +; SI-NEXT: s_lshl_b32 s25, s92, 16 +; SI-NEXT: s_or_b32 s24, s24, s25 +; SI-NEXT: s_and_b32 s25, s41, 0xffff +; SI-NEXT: s_lshl_b32 s26, s49, 16 +; SI-NEXT: s_or_b32 s25, s25, s26 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s26, s94, 16 +; SI-NEXT: s_or_b32 s14, s14, s26 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s26, s50, 16 +; SI-NEXT: s_or_b32 s15, s15, s26 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: v_mov_b32_e32 v9, s5 +; SI-NEXT: v_mov_b32_e32 v10, s16 +; SI-NEXT: v_mov_b32_e32 v11, s17 +; SI-NEXT: v_mov_b32_e32 v12, s18 +; SI-NEXT: v_mov_b32_e32 v13, s19 +; SI-NEXT: v_mov_b32_e32 v14, s20 +; SI-NEXT: v_mov_b32_e32 v15, s21 +; SI-NEXT: v_mov_b32_e32 v16, s22 +; SI-NEXT: v_mov_b32_e32 v17, s23 +; SI-NEXT: v_mov_b32_e32 v18, s24 +; SI-NEXT: v_mov_b32_e32 v19, s25 +; SI-NEXT: v_mov_b32_e32 v20, s14 +; SI-NEXT: v_mov_b32_e32 v21, s15 +; SI-NEXT: v_readlane_b32 s85, v22, 29 +; SI-NEXT: v_readlane_b32 s84, v22, 28 +; SI-NEXT: v_readlane_b32 s83, v22, 27 +; SI-NEXT: v_readlane_b32 s82, v22, 26 +; SI-NEXT: v_readlane_b32 s81, v22, 25 +; SI-NEXT: v_readlane_b32 s80, v22, 24 +; SI-NEXT: v_readlane_b32 s71, v22, 23 +; SI-NEXT: v_readlane_b32 s70, v22, 22 +; SI-NEXT: v_readlane_b32 s69, v22, 21 +; SI-NEXT: v_readlane_b32 s68, v22, 20 +; SI-NEXT: v_readlane_b32 s67, v22, 19 +; SI-NEXT: v_readlane_b32 s66, v22, 18 +; SI-NEXT: v_readlane_b32 s65, v22, 17 +; SI-NEXT: v_readlane_b32 s64, v22, 16 +; SI-NEXT: v_readlane_b32 s55, v22, 15 +; SI-NEXT: v_readlane_b32 s54, v22, 14 +; SI-NEXT: v_readlane_b32 s53, v22, 13 +; SI-NEXT: v_readlane_b32 s52, v22, 12 +; SI-NEXT: v_readlane_b32 s51, v22, 11 +; SI-NEXT: v_readlane_b32 s50, v22, 10 +; SI-NEXT: v_readlane_b32 s49, v22, 9 +; SI-NEXT: v_readlane_b32 s48, v22, 8 +; SI-NEXT: v_readlane_b32 s39, v22, 7 +; SI-NEXT: v_readlane_b32 s38, v22, 6 +; SI-NEXT: v_readlane_b32 s37, v22, 5 +; SI-NEXT: v_readlane_b32 s36, v22, 4 +; SI-NEXT: v_readlane_b32 s35, v22, 3 +; SI-NEXT: v_readlane_b32 s34, v22, 2 +; SI-NEXT: v_readlane_b32 s31, v22, 1 +; SI-NEXT: v_readlane_b32 s30, v22, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr94 ; SI-NEXT: s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v44i16_to_v44f16_scalar: @@ -34826,376 +32726,288 @@ define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v44f16_to_v44i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v52 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 ; SI-NEXT: v_or_b32_e32 v21, v21, v52 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_or_b32_e32 v19, v19, v52 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_or_b32_e32 v17, v17, v52 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v15, v15, v52 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 ; SI-NEXT: v_or_b32_e32 v13, v13, v52 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 ; SI-NEXT: v_or_b32_e32 v11, v11, v52 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 ; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 ; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 ; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 ; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 ; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 ; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_or_b32_e32 v9, v9, v52 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_or_b32_e32 v7, v7, v52 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v5, v5, v52 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v36 ; SI-NEXT: v_or_b32_e32 v3, v3, v52 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v38 ; SI-NEXT: v_or_b32_e32 v1, v1, v52 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 ; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 ; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_or_b32_e32 v0, v0, v51 -; SI-NEXT: v_or_b32_e32 v33, v33, v50 -; SI-NEXT: v_or_b32_e32 v32, v32, v49 -; SI-NEXT: v_or_b32_e32 v31, v31, v48 -; SI-NEXT: v_or_b32_e32 v30, v30, v39 -; SI-NEXT: v_or_b32_e32 v28, v28, v38 -; SI-NEXT: v_or_b32_e32 v27, v27, v37 -; SI-NEXT: v_or_b32_e32 v26, v26, v36 -; SI-NEXT: v_or_b32_e32 v24, v24, v35 -; SI-NEXT: v_or_b32_e32 v25, v25, v34 -; SI-NEXT: v_or_b32_e32 v23, v23, v29 -; SI-NEXT: v_alignbit_b32 v51, v1, v51, 16 -; SI-NEXT: v_alignbit_b32 v50, v3, v50, 16 -; SI-NEXT: v_alignbit_b32 v49, v5, v49, 16 -; SI-NEXT: v_alignbit_b32 v48, v7, v48, 16 -; SI-NEXT: v_alignbit_b32 v39, v9, v39, 16 -; SI-NEXT: v_alignbit_b32 v38, v11, v38, 16 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v0, v0, v22 +; SI-NEXT: v_or_b32_e32 v2, v2, v51 +; SI-NEXT: v_or_b32_e32 v4, v4, v50 +; SI-NEXT: v_or_b32_e32 v6, v6, v49 +; SI-NEXT: v_or_b32_e32 v8, v8, v48 +; SI-NEXT: v_or_b32_e32 v10, v10, v39 +; SI-NEXT: v_or_b32_e32 v12, v12, v37 +; SI-NEXT: v_or_b32_e32 v14, v14, v35 +; SI-NEXT: v_or_b32_e32 v16, v16, v32 +; SI-NEXT: v_or_b32_e32 v18, v18, v29 +; SI-NEXT: v_or_b32_e32 v20, v20, v26 +; SI-NEXT: v_alignbit_b32 v22, v1, v22, 16 +; SI-NEXT: v_alignbit_b32 v51, v3, v51, 16 +; SI-NEXT: v_alignbit_b32 v50, v5, v50, 16 +; SI-NEXT: v_alignbit_b32 v49, v7, v49, 16 +; SI-NEXT: v_alignbit_b32 v48, v9, v48, 16 +; SI-NEXT: v_alignbit_b32 v39, v11, v39, 16 ; SI-NEXT: v_alignbit_b32 v37, v13, v37, 16 -; SI-NEXT: v_alignbit_b32 v36, v15, v36, 16 -; SI-NEXT: v_alignbit_b32 v35, v17, v35, 16 -; SI-NEXT: v_alignbit_b32 v34, v19, v34, 16 -; SI-NEXT: v_alignbit_b32 v29, v21, v29, 16 +; SI-NEXT: v_alignbit_b32 v35, v15, v35, 16 +; SI-NEXT: v_alignbit_b32 v32, v17, v32, 16 +; SI-NEXT: v_alignbit_b32 v29, v19, v29, 16 +; SI-NEXT: v_alignbit_b32 v26, v21, v26, 16 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v22 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v22 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v51 +; SI-NEXT: v_or_b32_e32 v2, v2, v22 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v36 +; SI-NEXT: v_or_b32_e32 v3, v3, v22 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v50 +; SI-NEXT: v_or_b32_e32 v4, v4, v22 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v34 +; SI-NEXT: v_or_b32_e32 v5, v5, v22 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v49 +; SI-NEXT: v_or_b32_e32 v6, v6, v22 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v33 +; SI-NEXT: v_or_b32_e32 v7, v7, v22 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v48 +; SI-NEXT: v_or_b32_e32 v8, v8, v22 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v31 +; SI-NEXT: v_or_b32_e32 v9, v9, v22 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v39 +; SI-NEXT: v_or_b32_e32 v10, v10, v22 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 +; SI-NEXT: v_or_b32_e32 v11, v11, v22 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v37 +; SI-NEXT: v_or_b32_e32 v12, v12, v22 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v28 +; SI-NEXT: v_or_b32_e32 v13, v13, v22 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v35 +; SI-NEXT: v_or_b32_e32 v14, v14, v22 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v27 +; SI-NEXT: v_or_b32_e32 v15, v15, v22 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v32 +; SI-NEXT: v_or_b32_e32 v16, v16, v22 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v25 +; SI-NEXT: v_or_b32_e32 v17, v17, v22 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v29 +; SI-NEXT: v_or_b32_e32 v18, v18, v22 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v33 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v50 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v49 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v31 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v48 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v30 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v39 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v38 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v36 -; SI-NEXT: v_or_b32_e32 v16, v16, v24 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v34 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v24 +; SI-NEXT: v_or_b32_e32 v19, v19, v22 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v26 +; SI-NEXT: v_or_b32_e32 v20, v20, v22 ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v0, v0, v51 -; SI-NEXT: v_or_b32_e32 v2, v2, v33 -; SI-NEXT: v_or_b32_e32 v4, v4, v32 -; SI-NEXT: v_or_b32_e32 v6, v6, v31 -; SI-NEXT: v_or_b32_e32 v8, v8, v30 -; SI-NEXT: v_or_b32_e32 v10, v10, v28 -; SI-NEXT: v_or_b32_e32 v12, v12, v27 -; SI-NEXT: v_or_b32_e32 v14, v14, v26 -; SI-NEXT: v_or_b32_e32 v18, v18, v24 -; SI-NEXT: v_or_b32_e32 v20, v20, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 ; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -35603,441 +33415,391 @@ define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v17, v48, v17, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v50, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v51, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v52, v21, 0x5040100 -; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] - %cmp = icmp eq i32 %b, 0 - br i1 %cmp, label %cmp.true, label %cmp.false - -cmp.true: - %a1 = fadd <44 x half> %a, splat (half 0xH0200) - %a2 = bitcast <44 x half> %a1 to <44 x i16> - br label %end - -cmp.false: - %a3 = bitcast <44 x half> %a to <44 x i16> - br label %end - -end: - %phi = phi <44 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <44 x i16> %phi -} - -define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i32 inreg %b) { -; SI-LABEL: bitcast_v44f16_to_v44i16_scalar: -; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_lshr_b32 s14, s21, 16 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 -; SI-NEXT: s_lshr_b32 s10, s25, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s14 -; SI-NEXT: s_lshr_b32 s12, s23, 16 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s10 -; SI-NEXT: s_lshr_b32 s6, s29, 16 -; SI-NEXT: s_lshr_b32 s8, s27, 16 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s28 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v59, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 -; SI-NEXT: s_lshr_b32 s7, s28, 16 -; SI-NEXT: s_lshr_b32 s9, s26, 16 -; SI-NEXT: s_lshr_b32 s11, s24, 16 -; SI-NEXT: s_lshr_b32 s13, s22, 16 -; SI-NEXT: s_lshr_b32 s15, s20, 16 -; SI-NEXT: s_lshr_b32 s40, s18, 16 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: s_lshr_b32 s41, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v50, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v51, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v52, v21, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <44 x half> %a, splat (half 0xH0200) + %a2 = bitcast <44 x half> %a1 to <44 x i16> + br label %end + +cmp.false: + %a3 = bitcast <44 x half> %a to <44 x i16> + br label %end + +end: + %phi = phi <44 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <44 x i16> %phi +} + +define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v44f16_to_v44i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v25, v4 +; SI-NEXT: v_mov_b32_e32 v18, v2 +; SI-NEXT: v_mov_b32_e32 v19, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v20 +; SI-NEXT: s_lshr_b32 s7, s29, 16 +; SI-NEXT: s_lshr_b32 s6, s28, 16 +; SI-NEXT: s_lshr_b32 s13, s27, 16 +; SI-NEXT: s_lshr_b32 s14, s26, 16 +; SI-NEXT: s_lshr_b32 s12, s25, 16 +; SI-NEXT: s_lshr_b32 s15, s24, 16 +; SI-NEXT: s_lshr_b32 s11, s23, 16 +; SI-NEXT: s_lshr_b32 s40, s22, 16 +; SI-NEXT: s_lshr_b32 s10, s21, 16 +; SI-NEXT: s_lshr_b32 s41, s20, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s42, s18, 16 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v19 +; SI-NEXT: s_cbranch_scc0 .LBB59_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: s_cbranch_execnz .LBB59_4 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v20, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v55 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v20 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v46 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v40 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v44 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s40 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s19 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s15 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s13 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 +; SI-NEXT: v_or_b32_e32 v29, v2, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s23 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s21 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v6 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v44 +; SI-NEXT: v_or_b32_e32 v23, v2, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 +; SI-NEXT: v_or_b32_e32 v13, v9, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v47, v6 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v11 +; SI-NEXT: v_or_b32_e32 v31, v2, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s17 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_or_b32_e32 v9, v6, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s8 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s29 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v18 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v40 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v24 -; SI-NEXT: v_or_b32_e32 v44, v22, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v26 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; SI-NEXT: v_or_b32_e32 v33, v2, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s16 +; SI-NEXT: v_or_b32_e32 v11, v6, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s18 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_or_b32_e32 v56, v4, v10 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v40 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v6 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v40 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v32 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 -; SI-NEXT: v_or_b32_e32 v27, v22, v4 -; SI-NEXT: v_or_b32_e32 v56, v24, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v63 -; SI-NEXT: v_or_b32_e32 v26, v23, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v29 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v21 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_or_b32_e32 v60, v4, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v14 +; SI-NEXT: v_or_b32_e32 v58, v6, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s22 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v28 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v43, v22, v10 -; SI-NEXT: v_or_b32_e32 v63, v24, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_or_b32_e32 v60, v23, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v55 +; SI-NEXT: v_or_b32_e32 v5, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v54 +; SI-NEXT: v_or_b32_e32 v57, v15, v22 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s26 +; SI-NEXT: v_or_b32_e32 v7, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s28 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v62, v15, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v19 +; SI-NEXT: v_or_b32_e32 v61, v16, v30 +; SI-NEXT: v_or_b32_e32 v59, v14, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v20 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v62, v22, v16 -; SI-NEXT: v_or_b32_e32 v58, v24, v18 -; SI-NEXT: v_or_b32_e32 v22, v25, v20 -; SI-NEXT: v_lshr_b64 v[28:29], v[14:15], 16 -; SI-NEXT: v_lshr_b64 v[24:25], v[18:19], 16 -; SI-NEXT: v_or_b32_e32 v54, v23, v14 -; SI-NEXT: v_lshr_b64 v[50:51], v[0:1], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[2:3], 16 -; SI-NEXT: v_lshr_b64 v[38:39], v[4:5], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[6:7], 16 -; SI-NEXT: v_lshr_b64 v[34:35], v[8:9], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[10:11], 16 -; SI-NEXT: v_lshr_b64 v[30:31], v[12:13], 16 -; SI-NEXT: v_mov_b32_e32 v29, v60 -; SI-NEXT: v_lshr_b64 v[60:61], v[16:17], 16 -; SI-NEXT: v_mov_b32_e32 v25, v22 -; SI-NEXT: v_lshr_b64 v[22:23], v[20:21], 16 -; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshr_b64 v[26:27], v[12:13], 16 +; SI-NEXT: v_or_b32_e32 v19, v15, v0 +; SI-NEXT: v_or_b32_e32 v18, v14, v2 +; SI-NEXT: v_or_b32_e32 v25, v16, v4 +; SI-NEXT: v_or_b32_e32 v20, v17, v6 +; SI-NEXT: v_lshr_b64 v[52:53], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[50:51], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[30:31], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[32:33], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[16:17], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[6:7], 16 +; SI-NEXT: s_branch .LBB59_5 +; SI-NEXT: .LBB59_3: +; SI-NEXT: s_branch .LBB59_2 +; SI-NEXT: .LBB59_4: +; SI-NEXT: v_mov_b32_e32 v46, s7 +; SI-NEXT: v_mov_b32_e32 v42, s13 +; SI-NEXT: v_mov_b32_e32 v43, s12 +; SI-NEXT: v_mov_b32_e32 v44, s11 +; SI-NEXT: v_mov_b32_e32 v45, s10 +; SI-NEXT: v_mov_b32_e32 v47, s9 +; SI-NEXT: v_mov_b32_e32 v24, s8 +; SI-NEXT: v_mov_b32_e32 v11, s17 +; SI-NEXT: v_mov_b32_e32 v9, s19 +; SI-NEXT: v_mov_b32_e32 v13, s21 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v59, s28 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v61, s26 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v62, s24 +; SI-NEXT: v_mov_b32_e32 v57, s22 +; SI-NEXT: v_mov_b32_e32 v58, s20 +; SI-NEXT: v_mov_b32_e32 v60, s18 +; SI-NEXT: v_mov_b32_e32 v56, s16 +; SI-NEXT: v_mov_b32_e32 v23, s23 +; SI-NEXT: v_mov_b32_e32 v29, s25 +; SI-NEXT: v_mov_b32_e32 v31, s27 +; SI-NEXT: v_mov_b32_e32 v33, s29 +; SI-NEXT: v_mov_b32_e32 v52, s43 +; SI-NEXT: v_mov_b32_e32 v50, s42 +; SI-NEXT: v_mov_b32_e32 v26, s41 +; SI-NEXT: v_mov_b32_e32 v48, s40 +; SI-NEXT: v_mov_b32_e32 v38, s15 +; SI-NEXT: v_mov_b32_e32 v36, s14 +; SI-NEXT: v_mov_b32_e32 v34, s6 +; SI-NEXT: .LBB59_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v56 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v2, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v60 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v55 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v38 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v27 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v47 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v56 +; SI-NEXT: v_or_b32_e32 v22, v4, v6 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v26 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v58 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v45 +; SI-NEXT: v_or_b32_e32 v26, v6, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v48 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v57 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v34 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v29 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v59 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v44 +; SI-NEXT: v_or_b32_e32 v23, v8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v38 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v62 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v43 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v32 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v43 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v36 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v42 ; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v63 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 -; SI-NEXT: v_or_b32_e32 v12, v12, v14 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v45 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v54 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v28 -; SI-NEXT: v_or_b32_e32 v14, v14, v16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v42 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v62 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v60 -; SI-NEXT: v_or_b32_e32 v16, v16, v18 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v40 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v58 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v24 -; SI-NEXT: v_or_b32_e32 v18, v18, v20 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v53 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v20, v20, v22 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v52 -; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v34 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v46 +; SI-NEXT: v_or_b32_e32 v13, v13, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v41 +; SI-NEXT: v_or_b32_e32 v15, v1, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v1, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v27 +; SI-NEXT: v_or_b32_e32 v18, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55 +; SI-NEXT: v_or_b32_e32 v19, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; SI-NEXT: v_or_b32_e32 v20, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 +; SI-NEXT: v_or_b32_e32 v21, v1, v3 +; SI-NEXT: v_mov_b32_e32 v1, v24 +; SI-NEXT: v_mov_b32_e32 v3, v22 +; SI-NEXT: v_mov_b32_e32 v5, v26 +; SI-NEXT: v_mov_b32_e32 v7, v23 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB59_4: -; SI-NEXT: s_branch .LBB59_2 ; ; VI-LABEL: bitcast_v44f16_to_v44i16_scalar: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll index 1ff6bbd4e9a37..322689c91425b 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll @@ -5952,433 +5952,186 @@ end: define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v24i32_to_v48f16: ; SI: ; %bb.0: -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v22 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v24 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v50, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_alignbit_b32 v24, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v25, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v26, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v27, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v28, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v29, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v30, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v31, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v33, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v36, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v38, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v49, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 ; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_mov_b32_e32 v33, v22 -; SI-NEXT: v_mov_b32_e32 v32, v23 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_alignbit_b32 v24, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v25, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v26, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v27, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v28, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v29, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v30, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v31, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v33, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v36, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v38, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v49, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 ; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v28 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v24 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v62 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v61 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v58 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v57 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v45 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v46 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v41 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v42 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v53 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v54 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v48 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v50 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v16, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v34 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v37 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v32 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v35 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v39 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v33 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v55 +; SI-NEXT: v_or_b32_e32 v2, v2, v38 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v54 +; SI-NEXT: v_or_b32_e32 v4, v4, v36 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v53 +; SI-NEXT: v_or_b32_e32 v6, v6, v33 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v52 +; SI-NEXT: v_or_b32_e32 v8, v8, v31 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v51 +; SI-NEXT: v_or_b32_e32 v10, v10, v30 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v50 +; SI-NEXT: v_or_b32_e32 v12, v12, v29 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_or_b32_e32 v14, v14, v28 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 +; SI-NEXT: v_or_b32_e32 v16, v16, v27 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 +; SI-NEXT: v_or_b32_e32 v18, v18, v26 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 +; SI-NEXT: v_or_b32_e32 v20, v20, v25 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v34 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v49 +; SI-NEXT: v_or_b32_e32 v3, v3, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v36 +; SI-NEXT: v_or_b32_e32 v7, v7, v33 +; SI-NEXT: v_or_b32_e32 v9, v9, v31 +; SI-NEXT: v_or_b32_e32 v11, v11, v30 +; SI-NEXT: v_or_b32_e32 v13, v13, v29 +; SI-NEXT: v_or_b32_e32 v15, v15, v28 +; SI-NEXT: v_or_b32_e32 v17, v17, v27 +; SI-NEXT: v_or_b32_e32 v19, v19, v26 +; SI-NEXT: v_or_b32_e32 v21, v21, v25 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24i32_to_v48f16: @@ -6874,129 +6627,86 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i ; SI-LABEL: bitcast_v24i32_to_v48f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v24, s30, 0 ; SI-NEXT: v_mov_b32_e32 v11, s16 ; SI-NEXT: v_mov_b32_e32 v12, s17 ; SI-NEXT: v_mov_b32_e32 v13, s18 ; SI-NEXT: v_mov_b32_e32 v14, s19 ; SI-NEXT: v_mov_b32_e32 v15, s20 +; SI-NEXT: v_writelane_b32 v24, s31, 1 ; SI-NEXT: v_mov_b32_e32 v16, s21 ; SI-NEXT: v_mov_b32_e32 v17, s22 ; SI-NEXT: v_mov_b32_e32 v18, s23 ; SI-NEXT: v_mov_b32_e32 v19, s24 -; SI-NEXT: v_readfirstlane_b32 s24, v11 +; SI-NEXT: v_readfirstlane_b32 s40, v11 ; SI-NEXT: v_mov_b32_e32 v11, s25 -; SI-NEXT: v_readfirstlane_b32 s25, v12 +; SI-NEXT: v_readfirstlane_b32 s41, v12 ; SI-NEXT: v_mov_b32_e32 v12, s26 -; SI-NEXT: v_readfirstlane_b32 s26, v13 +; SI-NEXT: v_readfirstlane_b32 s24, v13 ; SI-NEXT: v_mov_b32_e32 v13, s27 -; SI-NEXT: v_readfirstlane_b32 s27, v14 +; SI-NEXT: v_readfirstlane_b32 s25, v14 ; SI-NEXT: v_mov_b32_e32 v14, s28 -; SI-NEXT: v_readfirstlane_b32 s28, v15 +; SI-NEXT: v_readfirstlane_b32 s22, v15 ; SI-NEXT: v_mov_b32_e32 v15, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: v_readfirstlane_b32 s29, v16 -; SI-NEXT: v_readfirstlane_b32 s23, v17 -; SI-NEXT: v_readfirstlane_b32 s22, v18 -; SI-NEXT: v_readfirstlane_b32 s21, v19 -; SI-NEXT: v_readfirstlane_b32 s20, v11 -; SI-NEXT: v_readfirstlane_b32 s19, v12 -; SI-NEXT: v_readfirstlane_b32 s18, v13 -; SI-NEXT: v_readfirstlane_b32 s17, v14 -; SI-NEXT: v_readfirstlane_b32 s16, v15 -; SI-NEXT: v_readfirstlane_b32 s15, v0 -; SI-NEXT: v_readfirstlane_b32 s14, v1 -; SI-NEXT: v_readfirstlane_b32 s13, v2 -; SI-NEXT: v_readfirstlane_b32 s12, v3 -; SI-NEXT: v_readfirstlane_b32 s11, v4 -; SI-NEXT: v_readfirstlane_b32 s10, v5 -; SI-NEXT: v_readfirstlane_b32 s8, v6 +; SI-NEXT: v_writelane_b32 v24, s34, 2 +; SI-NEXT: v_readfirstlane_b32 s23, v16 +; SI-NEXT: v_readfirstlane_b32 s20, v17 +; SI-NEXT: v_readfirstlane_b32 s21, v18 +; SI-NEXT: v_readfirstlane_b32 s18, v19 +; SI-NEXT: v_readfirstlane_b32 s19, v11 +; SI-NEXT: v_readfirstlane_b32 s16, v12 +; SI-NEXT: v_readfirstlane_b32 s17, v13 +; SI-NEXT: v_readfirstlane_b32 s14, v14 +; SI-NEXT: v_readfirstlane_b32 s15, v15 +; SI-NEXT: v_readfirstlane_b32 s12, v0 +; SI-NEXT: v_readfirstlane_b32 s13, v1 +; SI-NEXT: v_readfirstlane_b32 s10, v2 +; SI-NEXT: v_readfirstlane_b32 s11, v3 +; SI-NEXT: v_readfirstlane_b32 s8, v4 +; SI-NEXT: v_readfirstlane_b32 s9, v5 +; SI-NEXT: v_readfirstlane_b32 s6, v6 ; SI-NEXT: v_readfirstlane_b32 s7, v7 -; SI-NEXT: v_readfirstlane_b32 s6, v8 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v9 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s4, v8 +; SI-NEXT: s_and_b64 s[26:27], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v9 +; SI-NEXT: v_writelane_b32 v24, s35, 3 ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s24 +; SI-NEXT: s_lshr_b32 s88, s5, 16 +; SI-NEXT: s_lshr_b32 s89, s7, 16 +; SI-NEXT: s_lshr_b32 s90, s9, 16 +; SI-NEXT: s_lshr_b32 s91, s11, 16 +; SI-NEXT: s_lshr_b32 s92, s13, 16 +; SI-NEXT: s_lshr_b32 s93, s15, 16 +; SI-NEXT: s_lshr_b32 s94, s17, 16 +; SI-NEXT: s_lshr_b32 s95, s19, 16 +; SI-NEXT: s_lshr_b32 s30, s21, 16 +; SI-NEXT: s_lshr_b32 s31, s23, 16 +; SI-NEXT: s_lshr_b32 s34, s25, 16 +; SI-NEXT: s_lshr_b32 s35, s41, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[40:41], 16 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 @@ -7011,232 +6721,167 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i ; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: s_lshr_b32 s5, s25, 16 -; SI-NEXT: s_lshr_b32 s40, s26, 16 -; SI-NEXT: s_lshr_b32 s41, s27, 16 -; SI-NEXT: s_lshr_b32 s42, s28, 16 -; SI-NEXT: s_lshr_b32 s43, s29, 16 -; SI-NEXT: s_lshr_b32 s44, s23, 16 -; SI-NEXT: s_lshr_b32 s45, s22, 16 -; SI-NEXT: s_lshr_b32 s46, s21, 16 -; SI-NEXT: s_lshr_b32 s47, s20, 16 -; SI-NEXT: s_lshr_b32 s56, s19, 16 -; SI-NEXT: s_lshr_b32 s57, s18, 16 -; SI-NEXT: s_lshr_b32 s58, s17, 16 -; SI-NEXT: s_lshr_b32 s59, s16, 16 -; SI-NEXT: s_lshr_b32 s60, s15, 16 -; SI-NEXT: s_lshr_b32 s61, s14, 16 -; SI-NEXT: s_lshr_b32 s62, s13, 16 -; SI-NEXT: s_lshr_b32 s63, s12, 16 -; SI-NEXT: s_lshr_b32 s72, s11, 16 -; SI-NEXT: s_lshr_b32 s73, s10, 16 -; SI-NEXT: s_lshr_b32 s74, s8, 16 -; SI-NEXT: s_lshr_b32 s75, s7, 16 -; SI-NEXT: s_lshr_b32 s76, s6, 16 -; SI-NEXT: s_lshr_b32 s77, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s77 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s76 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s75 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s74 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s73 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s72 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s63 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s62 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 +; SI-NEXT: s_add_i32 s5, s5, 3 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[18:19], 16 +; SI-NEXT: s_lshr_b32 s88, s5, 16 +; SI-NEXT: s_lshr_b32 s89, s7, 16 +; SI-NEXT: s_lshr_b32 s90, s9, 16 +; SI-NEXT: s_lshr_b32 s91, s11, 16 +; SI-NEXT: s_lshr_b32 s92, s13, 16 +; SI-NEXT: s_lshr_b32 s93, s15, 16 +; SI-NEXT: s_lshr_b32 s94, s17, 16 +; SI-NEXT: s_lshr_b32 s95, s19, 16 +; SI-NEXT: s_lshr_b32 s30, s21, 16 +; SI-NEXT: s_lshr_b32 s31, s23, 16 +; SI-NEXT: s_lshr_b32 s34, s25, 16 +; SI-NEXT: s_lshr_b32 s35, s41, 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[40:41], 16 ; SI-NEXT: .LBB17_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v40, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v40, v1 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 -; SI-NEXT: v_or_b32_e32 v2, v52, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v52 -; SI-NEXT: v_or_b32_e32 v5, v5, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v50 -; SI-NEXT: v_or_b32_e32 v7, v7, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v48 -; SI-NEXT: v_or_b32_e32 v9, v38, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38 -; SI-NEXT: v_or_b32_e32 v11, v36, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v36 -; SI-NEXT: v_or_b32_e32 v13, v34, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 -; SI-NEXT: v_or_b32_e32 v15, v32, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v32 -; SI-NEXT: v_or_b32_e32 v17, v30, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v30 -; SI-NEXT: v_or_b32_e32 v19, v28, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v28 -; SI-NEXT: v_or_b32_e32 v21, v26, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v26 -; SI-NEXT: v_or_b32_e32 v3, v54, v3 -; SI-NEXT: v_or_b32_e32 v4, v51, v4 -; SI-NEXT: v_or_b32_e32 v6, v49, v6 -; SI-NEXT: v_or_b32_e32 v8, v39, v8 -; SI-NEXT: v_or_b32_e32 v10, v37, v10 -; SI-NEXT: v_or_b32_e32 v12, v35, v12 -; SI-NEXT: v_or_b32_e32 v14, v33, v14 -; SI-NEXT: v_or_b32_e32 v16, v31, v16 -; SI-NEXT: v_or_b32_e32 v18, v29, v18 -; SI-NEXT: v_or_b32_e32 v20, v27, v20 -; SI-NEXT: v_or_b32_e32 v22, v25, v22 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB17_4: -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: s_lshl_b32 s27, s76, 16 +; SI-NEXT: s_and_b32 s29, s40, 0xffff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s29, s41, 0xffff +; SI-NEXT: s_lshl_b32 s40, s35, 16 +; SI-NEXT: s_or_b32 s29, s29, s40 +; SI-NEXT: s_lshl_b32 s40, s74, 16 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_or_b32 s24, s24, s40 +; SI-NEXT: s_and_b32 s25, s25, 0xffff +; SI-NEXT: s_lshl_b32 s40, s34, 16 +; SI-NEXT: s_or_b32 s25, s25, s40 +; SI-NEXT: s_lshl_b32 s40, s72, 16 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_or_b32 s22, s22, s40 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s40, s31, 16 +; SI-NEXT: s_or_b32 s23, s23, s40 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_lshl_b32 s40, s62, 16 +; SI-NEXT: s_or_b32 s20, s20, s40 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s40, s30, 16 +; SI-NEXT: s_or_b32 s21, s21, s40 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s40, s60, 16 +; SI-NEXT: s_or_b32 s18, s18, s40 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s40, s95, 16 +; SI-NEXT: s_or_b32 s19, s19, s40 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s40, s58, 16 +; SI-NEXT: s_or_b32 s16, s16, s40 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s40, s94, 16 +; SI-NEXT: s_or_b32 s17, s17, s40 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s40, s56, 16 +; SI-NEXT: s_or_b32 s14, s14, s40 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s40, s93, 16 +; SI-NEXT: s_or_b32 s15, s15, s40 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s40, s46, 16 +; SI-NEXT: s_or_b32 s12, s12, s40 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s40, s92, 16 +; SI-NEXT: s_or_b32 s13, s13, s40 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s40, s44, 16 +; SI-NEXT: s_or_b32 s10, s10, s40 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s40, s91, 16 +; SI-NEXT: s_or_b32 s11, s11, s40 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s40, s42, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s28, s28, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s40 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s40, s90, 16 +; SI-NEXT: s_or_b32 s6, s6, s28 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s28, s89, 16 +; SI-NEXT: s_or_b32 s4, s4, s26 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s26, s88, 16 +; SI-NEXT: s_or_b32 s9, s9, s40 +; SI-NEXT: s_or_b32 s7, s7, s28 +; SI-NEXT: s_or_b32 s5, s5, s26 +; SI-NEXT: v_mov_b32_e32 v0, s27 +; SI-NEXT: v_mov_b32_e32 v1, s29 +; SI-NEXT: v_mov_b32_e32 v2, s24 +; SI-NEXT: v_mov_b32_e32 v3, s25 +; SI-NEXT: v_mov_b32_e32 v4, s22 +; SI-NEXT: v_mov_b32_e32 v5, s23 +; SI-NEXT: v_mov_b32_e32 v6, s20 +; SI-NEXT: v_mov_b32_e32 v7, s21 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v9, s19 +; SI-NEXT: v_mov_b32_e32 v10, s16 +; SI-NEXT: v_mov_b32_e32 v11, s17 +; SI-NEXT: v_mov_b32_e32 v12, s14 +; SI-NEXT: v_mov_b32_e32 v13, s15 +; SI-NEXT: v_mov_b32_e32 v14, s12 +; SI-NEXT: v_mov_b32_e32 v15, s13 +; SI-NEXT: v_mov_b32_e32 v16, s10 +; SI-NEXT: v_mov_b32_e32 v17, s11 +; SI-NEXT: v_mov_b32_e32 v18, s8 +; SI-NEXT: v_mov_b32_e32 v19, s9 +; SI-NEXT: v_mov_b32_e32 v20, s6 +; SI-NEXT: v_mov_b32_e32 v21, s7 +; SI-NEXT: v_mov_b32_e32 v22, s4 +; SI-NEXT: v_mov_b32_e32 v23, s5 +; SI-NEXT: v_readlane_b32 s35, v24, 3 +; SI-NEXT: v_readlane_b32 s34, v24, 2 +; SI-NEXT: v_readlane_b32 s31, v24, 1 +; SI-NEXT: v_readlane_b32 s30, v24, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr88 ; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v24i32_to_v48f16_scalar: @@ -7881,7 +7526,6 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v48f16_to_v24i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -7898,164 +7542,220 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v47, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_mov_b32_e32 v36, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_mov_b32_e32 v38, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_mov_b32_e32 v48, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_mov_b32_e32 v49, v14 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_mov_b32_e32 v50, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_mov_b32_e32 v51, v12 +; SI-NEXT: v_mov_b32_e32 v52, v11 +; SI-NEXT: v_mov_b32_e32 v53, v10 +; SI-NEXT: v_mov_b32_e32 v54, v9 +; SI-NEXT: v_mov_b32_e32 v55, v8 +; SI-NEXT: v_mov_b32_e32 v40, v7 +; SI-NEXT: v_mov_b32_e32 v41, v6 +; SI-NEXT: v_mov_b32_e32 v42, v5 +; SI-NEXT: v_mov_b32_e32 v43, v4 +; SI-NEXT: v_mov_b32_e32 v44, v3 +; SI-NEXT: v_mov_b32_e32 v45, v2 +; SI-NEXT: v_mov_b32_e32 v46, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v47 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v23 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB18_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v58 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v57 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v56 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v63 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v62 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v60 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v51 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v49 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v38 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v36 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 @@ -8083,124 +7783,20 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v43 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 -; SI-NEXT: v_or_b32_e32 v1, v52, v1 -; SI-NEXT: v_or_b32_e32 v2, v50, v2 -; SI-NEXT: v_or_b32_e32 v3, v48, v3 -; SI-NEXT: v_or_b32_e32 v4, v38, v4 -; SI-NEXT: v_or_b32_e32 v5, v36, v5 -; SI-NEXT: v_or_b32_e32 v6, v34, v6 -; SI-NEXT: v_or_b32_e32 v7, v32, v7 -; SI-NEXT: v_or_b32_e32 v8, v62, v8 -; SI-NEXT: v_or_b32_e32 v9, v60, v9 -; SI-NEXT: v_or_b32_e32 v10, v58, v10 -; SI-NEXT: v_or_b32_e32 v11, v56, v11 -; SI-NEXT: v_or_b32_e32 v12, v46, v12 -; SI-NEXT: v_or_b32_e32 v13, v44, v13 -; SI-NEXT: v_or_b32_e32 v14, v42, v14 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v40, v23 -; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: .LBB18_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -8213,10 +7809,10 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v43 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -8225,194 +7821,198 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v6, v42 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v41 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v40 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v55 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v58 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v57 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v54 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v53 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v43 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v42 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v48 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v38 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v9, v62 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v37 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v59 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v61 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v41 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v39 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v17, v19, v17 ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v36 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 @@ -9085,536 +8685,341 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i ; SI-LABEL: bitcast_v48f16_to_v24i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: s_lshr_b32 s40, s17, 16 -; SI-NEXT: s_lshr_b32 s41, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: s_lshr_b32 s15, s18, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v23 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v8 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: v_mov_b32_e32 v32, v9 +; SI-NEXT: v_mov_b32_e32 v33, v8 +; SI-NEXT: v_mov_b32_e32 v34, v7 +; SI-NEXT: v_mov_b32_e32 v35, v6 +; SI-NEXT: v_mov_b32_e32 v36, v5 +; SI-NEXT: v_mov_b32_e32 v37, v4 +; SI-NEXT: v_mov_b32_e32 v38, v3 +; SI-NEXT: v_mov_b32_e32 v39, v2 +; SI-NEXT: v_mov_b32_e32 v48, v1 +; SI-NEXT: v_mov_b32_e32 v49, v0 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v37 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v38 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v39 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v48 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v49 ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v46 -; SI-NEXT: v_or_b32_e32 v3, v25, v3 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 -; SI-NEXT: v_or_b32_e32 v1, v63, v1 -; SI-NEXT: v_or_b32_e32 v2, v47, v2 -; SI-NEXT: v_or_b32_e32 v4, v62, v4 -; SI-NEXT: v_or_b32_e32 v5, v59, v5 -; SI-NEXT: v_or_b32_e32 v6, v58, v6 -; SI-NEXT: v_or_b32_e32 v7, v54, v7 -; SI-NEXT: v_or_b32_e32 v8, v53, v8 -; SI-NEXT: v_or_b32_e32 v9, v50, v9 -; SI-NEXT: v_or_b32_e32 v10, v49, v10 -; SI-NEXT: v_or_b32_e32 v11, v38, v11 -; SI-NEXT: v_or_b32_e32 v12, v37, v12 -; SI-NEXT: v_or_b32_e32 v13, v35, v13 -; SI-NEXT: v_or_b32_e32 v14, v33, v14 -; SI-NEXT: v_or_b32_e32 v15, v26, v15 -; SI-NEXT: v_or_b32_e32 v16, v31, v16 -; SI-NEXT: v_or_b32_e32 v17, v29, v17 -; SI-NEXT: v_or_b32_e32 v18, v27, v18 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: s_cbranch_execnz .LBB19_3 -; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 +; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s41 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v54 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v48 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v26 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v30 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v29 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v27 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s40 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s15 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s14 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s22 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v57 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s11 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v51 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v39 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s28 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v35 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v43 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v48 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v41 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v40 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v38 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v37 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v53 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v35 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v34 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v51 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v32 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 ; SI-NEXT: .LBB19_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB19_4: -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v55, v39 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v53, v37 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v52, v36 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v51, v35 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v50, v34 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v49, v33 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v48, v32 -; SI-NEXT: v_mov_b32_e32 v39, v26 -; SI-NEXT: v_mov_b32_e32 v37, v58 -; SI-NEXT: v_mov_b32_e32 v58, v27 -; SI-NEXT: v_mov_b32_e32 v36, v59 -; SI-NEXT: v_mov_b32_e32 v59, v28 -; SI-NEXT: v_mov_b32_e32 v35, v60 -; SI-NEXT: v_mov_b32_e32 v60, v29 -; SI-NEXT: v_mov_b32_e32 v34, v61 -; SI-NEXT: v_mov_b32_e32 v61, v30 -; SI-NEXT: v_mov_b32_e32 v33, v62 -; SI-NEXT: v_mov_b32_e32 v62, v31 -; SI-NEXT: v_mov_b32_e32 v32, v63 -; SI-NEXT: v_mov_b32_e32 v63, v24 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v24, v63 -; SI-NEXT: v_mov_b32_e32 v63, v32 -; SI-NEXT: v_mov_b32_e32 v31, v62 -; SI-NEXT: v_mov_b32_e32 v62, v33 -; SI-NEXT: v_mov_b32_e32 v30, v61 -; SI-NEXT: v_mov_b32_e32 v61, v34 -; SI-NEXT: v_mov_b32_e32 v29, v60 -; SI-NEXT: v_mov_b32_e32 v60, v35 -; SI-NEXT: v_mov_b32_e32 v28, v59 -; SI-NEXT: v_mov_b32_e32 v59, v36 -; SI-NEXT: v_mov_b32_e32 v27, v58 -; SI-NEXT: v_mov_b32_e32 v58, v37 -; SI-NEXT: v_mov_b32_e32 v26, v39 -; SI-NEXT: v_mov_b32_e32 v32, v48 -; SI-NEXT: v_mov_b32_e32 v33, v49 -; SI-NEXT: v_mov_b32_e32 v34, v50 -; SI-NEXT: v_mov_b32_e32 v35, v51 -; SI-NEXT: v_mov_b32_e32 v36, v52 -; SI-NEXT: v_mov_b32_e32 v37, v53 -; SI-NEXT: v_mov_b32_e32 v39, v55 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: v_mov_b32_e32 v41, v40 ; SI-NEXT: s_branch .LBB19_2 ; ; VI-LABEL: bitcast_v48f16_to_v24i32_scalar: @@ -15279,433 +14684,186 @@ end: define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v24f32_to_v48f16: ; SI: ; %bb.0: -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v22 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v24 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v50, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_alignbit_b32 v24, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v25, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v26, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v27, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v28, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v29, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v30, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v31, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v33, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v36, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v38, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v49, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 ; SI-NEXT: .LBB32_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 ; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 ; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 ; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_mov_b32_e32 v33, v22 -; SI-NEXT: v_mov_b32_e32 v32, v23 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_alignbit_b32 v24, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v25, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v26, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v27, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v28, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v29, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v30, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v31, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v33, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v36, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v38, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v49, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 ; SI-NEXT: .LBB32_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v28 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v24 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v62 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v61 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v58 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v57 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v45 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v46 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v41 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v42 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v53 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v54 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v48 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v50 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v16, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v34 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v37 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v32 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v35 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v39 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v33 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v55 +; SI-NEXT: v_or_b32_e32 v2, v2, v38 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v54 +; SI-NEXT: v_or_b32_e32 v4, v4, v36 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v53 +; SI-NEXT: v_or_b32_e32 v6, v6, v33 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v52 +; SI-NEXT: v_or_b32_e32 v8, v8, v31 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v51 +; SI-NEXT: v_or_b32_e32 v10, v10, v30 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v50 +; SI-NEXT: v_or_b32_e32 v12, v12, v29 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_or_b32_e32 v14, v14, v28 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 +; SI-NEXT: v_or_b32_e32 v16, v16, v27 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 +; SI-NEXT: v_or_b32_e32 v18, v18, v26 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 +; SI-NEXT: v_or_b32_e32 v20, v20, v25 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v34 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v49 +; SI-NEXT: v_or_b32_e32 v3, v3, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v36 +; SI-NEXT: v_or_b32_e32 v7, v7, v33 +; SI-NEXT: v_or_b32_e32 v9, v9, v31 +; SI-NEXT: v_or_b32_e32 v11, v11, v30 +; SI-NEXT: v_or_b32_e32 v13, v13, v29 +; SI-NEXT: v_or_b32_e32 v15, v15, v28 +; SI-NEXT: v_or_b32_e32 v17, v17, v27 +; SI-NEXT: v_or_b32_e32 v19, v19, v26 +; SI-NEXT: v_or_b32_e32 v21, v21, v25 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24f32_to_v48f16: @@ -16178,423 +15336,228 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v18, s16 -; SI-NEXT: v_mov_b32_e32 v25, s17 -; SI-NEXT: v_mov_b32_e32 v24, s18 -; SI-NEXT: v_mov_b32_e32 v23, s19 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v62, s20 -; SI-NEXT: v_mov_b32_e32 v60, s21 -; SI-NEXT: v_mov_b32_e32 v58, s22 -; SI-NEXT: v_mov_b32_e32 v57, s23 -; SI-NEXT: v_mov_b32_e32 v56, s24 -; SI-NEXT: v_mov_b32_e32 v20, s25 +; SI-NEXT: v_mov_b32_e32 v22, s16 +; SI-NEXT: v_mov_b32_e32 v23, s17 +; SI-NEXT: v_mov_b32_e32 v20, s18 +; SI-NEXT: v_mov_b32_e32 v21, s19 +; SI-NEXT: v_mov_b32_e32 v18, s20 +; SI-NEXT: v_mov_b32_e32 v19, s21 +; SI-NEXT: v_mov_b32_e32 v14, s22 +; SI-NEXT: v_mov_b32_e32 v15, s23 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v17, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v21, s26 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v63, s27 -; SI-NEXT: v_mov_b32_e32 v61, s28 -; SI-NEXT: v_mov_b32_e32 v59, s29 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB33_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v58 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v25 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v9 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v21 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v5 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v7 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v29, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v25 +; SI-NEXT: v_lshr_b64 v[34:35], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[14:15], 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v11 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v15 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v18 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshr_b64 v[38:39], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[22:23], 16 ; SI-NEXT: s_cbranch_execnz .LBB33_3 ; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v63 -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v7 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v56 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 ; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v62 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v62 +; SI-NEXT: v_lshr_b64 v[34:35], v[8:9], 16 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v35, 1.0, v61 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v25 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v24 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v23 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v60 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v58 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v57 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshr_b64 v[35:36], v[6:7], 16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_lshr_b64 v[36:37], v[4:5], 16 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 ; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; SI-NEXT: v_add_f32_e32 v34, 1.0, v59 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v18 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshr_b64 v[37:38], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[22:23], 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v11 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 ; SI-NEXT: .LBB33_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v30 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v30, v22, v25 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v32 +; SI-NEXT: v_or_b32_e32 v31, v22, v23 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v27 +; SI-NEXT: v_or_b32_e32 v32, v20, v22 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v33 +; SI-NEXT: v_or_b32_e32 v33, v20, v21 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v18, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v44 +; SI-NEXT: v_or_b32_e32 v25, v18, v19 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v14, v18 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v43 +; SI-NEXT: v_or_b32_e32 v27, v14, v15 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v29 +; SI-NEXT: v_or_b32_e32 v28, v14, v15 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v42 +; SI-NEXT: v_or_b32_e32 v29, v14, v15 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v49 +; SI-NEXT: v_or_b32_e32 v10, v10, v14 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v48 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v40 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v22 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v46 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v51 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v43 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v39 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v40 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v37 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v52 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v9, v19, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v48 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v10, v18, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v12 -; SI-NEXT: v_or_b32_e32 v11, v20, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v34 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v12, v18, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 -; SI-NEXT: v_or_b32_e32 v13, v20, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v32 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v14, v18, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v16 -; SI-NEXT: v_or_b32_e32 v15, v20, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v30 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v20, v17 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v19, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v28 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v24, v27 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v26 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 +; SI-NEXT: v_or_b32_e32 v14, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v35 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v34 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v30 +; SI-NEXT: v_mov_b32_e32 v1, v31 +; SI-NEXT: v_mov_b32_e32 v2, v32 +; SI-NEXT: v_mov_b32_e32 v3, v33 +; SI-NEXT: v_mov_b32_e32 v4, v24 +; SI-NEXT: v_mov_b32_e32 v5, v25 +; SI-NEXT: v_mov_b32_e32 v6, v26 +; SI-NEXT: v_mov_b32_e32 v7, v27 +; SI-NEXT: v_mov_b32_e32 v8, v28 +; SI-NEXT: v_mov_b32_e32 v9, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: s_branch .LBB33_2 ; ; VI-LABEL: bitcast_v24f32_to_v48f16_scalar: @@ -17293,7 +16256,6 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v48f16_to_v24f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -17310,164 +16272,220 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v47, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_mov_b32_e32 v36, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_mov_b32_e32 v38, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v32 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_mov_b32_e32 v48, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_mov_b32_e32 v49, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_mov_b32_e32 v50, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_mov_b32_e32 v51, v12 +; SI-NEXT: v_mov_b32_e32 v52, v11 +; SI-NEXT: v_mov_b32_e32 v53, v10 +; SI-NEXT: v_mov_b32_e32 v54, v9 +; SI-NEXT: v_mov_b32_e32 v55, v8 +; SI-NEXT: v_mov_b32_e32 v40, v7 +; SI-NEXT: v_mov_b32_e32 v41, v6 +; SI-NEXT: v_mov_b32_e32 v42, v5 +; SI-NEXT: v_mov_b32_e32 v43, v4 +; SI-NEXT: v_mov_b32_e32 v44, v3 +; SI-NEXT: v_mov_b32_e32 v45, v2 +; SI-NEXT: v_mov_b32_e32 v46, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v23 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v47 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v41 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v58 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v57 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v56 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v63 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v62 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v60 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v51 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v49 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v38 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v36 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 @@ -17495,124 +16513,20 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v43 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 -; SI-NEXT: v_or_b32_e32 v1, v52, v1 -; SI-NEXT: v_or_b32_e32 v2, v50, v2 -; SI-NEXT: v_or_b32_e32 v3, v48, v3 -; SI-NEXT: v_or_b32_e32 v4, v38, v4 -; SI-NEXT: v_or_b32_e32 v5, v36, v5 -; SI-NEXT: v_or_b32_e32 v6, v34, v6 -; SI-NEXT: v_or_b32_e32 v7, v32, v7 -; SI-NEXT: v_or_b32_e32 v8, v62, v8 -; SI-NEXT: v_or_b32_e32 v9, v60, v9 -; SI-NEXT: v_or_b32_e32 v10, v58, v10 -; SI-NEXT: v_or_b32_e32 v11, v56, v11 -; SI-NEXT: v_or_b32_e32 v12, v46, v12 -; SI-NEXT: v_or_b32_e32 v13, v44, v13 -; SI-NEXT: v_or_b32_e32 v14, v42, v14 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v40, v23 -; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: .LBB34_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -17625,10 +16539,10 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v43 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -17637,194 +16551,198 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v6, v42 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v41 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v40 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v55 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v58 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v57 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v54 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v53 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v43 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v42 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v48 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v38 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v9, v62 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v37 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v59 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v61 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v41 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v39 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v17, v19, v17 ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v36 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 @@ -18497,536 +17415,341 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, ; SI-LABEL: bitcast_v48f16_to_v24f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: s_lshr_b32 s40, s17, 16 -; SI-NEXT: s_lshr_b32 s41, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: s_lshr_b32 s15, s18, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v23 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v8 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: v_mov_b32_e32 v32, v9 +; SI-NEXT: v_mov_b32_e32 v33, v8 +; SI-NEXT: v_mov_b32_e32 v34, v7 +; SI-NEXT: v_mov_b32_e32 v35, v6 +; SI-NEXT: v_mov_b32_e32 v36, v5 +; SI-NEXT: v_mov_b32_e32 v37, v4 +; SI-NEXT: v_mov_b32_e32 v38, v3 +; SI-NEXT: v_mov_b32_e32 v39, v2 +; SI-NEXT: v_mov_b32_e32 v48, v1 +; SI-NEXT: v_mov_b32_e32 v49, v0 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v37 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v38 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v39 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v48 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v49 ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v46 -; SI-NEXT: v_or_b32_e32 v3, v25, v3 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 -; SI-NEXT: v_or_b32_e32 v1, v63, v1 -; SI-NEXT: v_or_b32_e32 v2, v47, v2 -; SI-NEXT: v_or_b32_e32 v4, v62, v4 -; SI-NEXT: v_or_b32_e32 v5, v59, v5 -; SI-NEXT: v_or_b32_e32 v6, v58, v6 -; SI-NEXT: v_or_b32_e32 v7, v54, v7 -; SI-NEXT: v_or_b32_e32 v8, v53, v8 -; SI-NEXT: v_or_b32_e32 v9, v50, v9 -; SI-NEXT: v_or_b32_e32 v10, v49, v10 -; SI-NEXT: v_or_b32_e32 v11, v38, v11 -; SI-NEXT: v_or_b32_e32 v12, v37, v12 -; SI-NEXT: v_or_b32_e32 v13, v35, v13 -; SI-NEXT: v_or_b32_e32 v14, v33, v14 -; SI-NEXT: v_or_b32_e32 v15, v26, v15 -; SI-NEXT: v_or_b32_e32 v16, v31, v16 -; SI-NEXT: v_or_b32_e32 v17, v29, v17 -; SI-NEXT: v_or_b32_e32 v18, v27, v18 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s41 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s40 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v54 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v48 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v26 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v30 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v29 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v27 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s15 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s14 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s22 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v57 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s11 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v51 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v39 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s28 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v35 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v43 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v48 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v41 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v40 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v38 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v37 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v53 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v35 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v34 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v51 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v32 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 ; SI-NEXT: .LBB35_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB35_4: -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v55, v39 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v53, v37 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v52, v36 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v51, v35 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v50, v34 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v49, v33 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v48, v32 -; SI-NEXT: v_mov_b32_e32 v39, v26 -; SI-NEXT: v_mov_b32_e32 v37, v58 -; SI-NEXT: v_mov_b32_e32 v58, v27 -; SI-NEXT: v_mov_b32_e32 v36, v59 -; SI-NEXT: v_mov_b32_e32 v59, v28 -; SI-NEXT: v_mov_b32_e32 v35, v60 -; SI-NEXT: v_mov_b32_e32 v60, v29 -; SI-NEXT: v_mov_b32_e32 v34, v61 -; SI-NEXT: v_mov_b32_e32 v61, v30 -; SI-NEXT: v_mov_b32_e32 v33, v62 -; SI-NEXT: v_mov_b32_e32 v62, v31 -; SI-NEXT: v_mov_b32_e32 v32, v63 -; SI-NEXT: v_mov_b32_e32 v63, v24 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v24, v63 -; SI-NEXT: v_mov_b32_e32 v63, v32 -; SI-NEXT: v_mov_b32_e32 v31, v62 -; SI-NEXT: v_mov_b32_e32 v62, v33 -; SI-NEXT: v_mov_b32_e32 v30, v61 -; SI-NEXT: v_mov_b32_e32 v61, v34 -; SI-NEXT: v_mov_b32_e32 v29, v60 -; SI-NEXT: v_mov_b32_e32 v60, v35 -; SI-NEXT: v_mov_b32_e32 v28, v59 -; SI-NEXT: v_mov_b32_e32 v59, v36 -; SI-NEXT: v_mov_b32_e32 v27, v58 -; SI-NEXT: v_mov_b32_e32 v58, v37 -; SI-NEXT: v_mov_b32_e32 v26, v39 -; SI-NEXT: v_mov_b32_e32 v32, v48 -; SI-NEXT: v_mov_b32_e32 v33, v49 -; SI-NEXT: v_mov_b32_e32 v34, v50 -; SI-NEXT: v_mov_b32_e32 v35, v51 -; SI-NEXT: v_mov_b32_e32 v36, v52 -; SI-NEXT: v_mov_b32_e32 v37, v53 -; SI-NEXT: v_mov_b32_e32 v39, v55 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: v_mov_b32_e32 v41, v40 ; SI-NEXT: s_branch .LBB35_2 ; ; VI-LABEL: bitcast_v48f16_to_v24f32_scalar: @@ -23907,190 +22630,60 @@ end: define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v12i64_to_v48f16: ; SI: ; %bb.0: -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v22 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v24 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v50, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_alignbit_b32 v24, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v25, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v26, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v27, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v28, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v29, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v30, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v31, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v32, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v35, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v38, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v49, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 ; SI-NEXT: .LBB44_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_4 @@ -24111,228 +22704,112 @@ define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 ; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; SI-NEXT: v_cvt_f32_f16_e32 v30, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 ; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_mov_b32_e32 v33, v22 -; SI-NEXT: v_mov_b32_e32 v32, v23 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v24, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v25, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v26, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v27, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v28, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v29, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v30, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v31, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v32, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v35, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v38, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v49, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 ; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v28 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v24 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v62 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v61 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v58 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v57 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v45 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v46 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v41 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v42 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v53 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v54 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v48 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v50 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v16, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v34 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v37 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v32 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v35 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v39 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v33 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v55 +; SI-NEXT: v_or_b32_e32 v2, v2, v38 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v54 +; SI-NEXT: v_or_b32_e32 v4, v4, v35 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v53 +; SI-NEXT: v_or_b32_e32 v6, v6, v32 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v52 +; SI-NEXT: v_or_b32_e32 v8, v8, v31 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v51 +; SI-NEXT: v_or_b32_e32 v10, v10, v30 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v50 +; SI-NEXT: v_or_b32_e32 v12, v12, v29 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_or_b32_e32 v14, v14, v28 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 +; SI-NEXT: v_or_b32_e32 v16, v16, v27 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 +; SI-NEXT: v_or_b32_e32 v18, v18, v26 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v36 +; SI-NEXT: v_or_b32_e32 v20, v20, v25 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v34 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v49 +; SI-NEXT: v_or_b32_e32 v3, v3, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v35 +; SI-NEXT: v_or_b32_e32 v7, v7, v32 +; SI-NEXT: v_or_b32_e32 v9, v9, v31 +; SI-NEXT: v_or_b32_e32 v11, v11, v30 +; SI-NEXT: v_or_b32_e32 v13, v13, v29 +; SI-NEXT: v_or_b32_e32 v15, v15, v28 +; SI-NEXT: v_or_b32_e32 v17, v17, v27 +; SI-NEXT: v_or_b32_e32 v19, v19, v26 +; SI-NEXT: v_or_b32_e32 v21, v21, v25 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i64_to_v48f16: @@ -24840,369 +23317,261 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i ; SI-LABEL: bitcast_v12i64_to_v48f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v24, s30, 0 ; SI-NEXT: v_mov_b32_e32 v11, s16 ; SI-NEXT: v_mov_b32_e32 v12, s17 ; SI-NEXT: v_mov_b32_e32 v13, s18 ; SI-NEXT: v_mov_b32_e32 v14, s19 ; SI-NEXT: v_mov_b32_e32 v15, s20 +; SI-NEXT: v_writelane_b32 v24, s31, 1 ; SI-NEXT: v_mov_b32_e32 v16, s21 ; SI-NEXT: v_mov_b32_e32 v17, s22 ; SI-NEXT: v_mov_b32_e32 v18, s23 ; SI-NEXT: v_mov_b32_e32 v19, s24 -; SI-NEXT: v_readfirstlane_b32 s24, v11 +; SI-NEXT: v_readfirstlane_b32 s40, v11 ; SI-NEXT: v_mov_b32_e32 v11, s25 -; SI-NEXT: v_readfirstlane_b32 s40, v12 +; SI-NEXT: v_readfirstlane_b32 s41, v12 ; SI-NEXT: v_mov_b32_e32 v12, s26 -; SI-NEXT: v_readfirstlane_b32 s25, v13 +; SI-NEXT: v_readfirstlane_b32 s24, v13 ; SI-NEXT: v_mov_b32_e32 v13, s27 -; SI-NEXT: v_readfirstlane_b32 s27, v14 +; SI-NEXT: v_readfirstlane_b32 s25, v14 ; SI-NEXT: v_mov_b32_e32 v14, s28 -; SI-NEXT: v_readfirstlane_b32 s26, v15 +; SI-NEXT: v_readfirstlane_b32 s22, v15 ; SI-NEXT: v_mov_b32_e32 v15, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: v_readfirstlane_b32 s28, v16 -; SI-NEXT: v_readfirstlane_b32 s22, v17 -; SI-NEXT: v_readfirstlane_b32 s23, v18 -; SI-NEXT: v_readfirstlane_b32 s20, v19 -; SI-NEXT: v_readfirstlane_b32 s21, v11 -; SI-NEXT: v_readfirstlane_b32 s18, v12 -; SI-NEXT: v_readfirstlane_b32 s19, v13 -; SI-NEXT: v_readfirstlane_b32 s16, v14 -; SI-NEXT: v_readfirstlane_b32 s17, v15 -; SI-NEXT: v_readfirstlane_b32 s14, v0 -; SI-NEXT: v_readfirstlane_b32 s15, v1 -; SI-NEXT: v_readfirstlane_b32 s12, v2 -; SI-NEXT: v_readfirstlane_b32 s13, v3 -; SI-NEXT: v_readfirstlane_b32 s10, v4 -; SI-NEXT: v_readfirstlane_b32 s11, v5 -; SI-NEXT: v_readfirstlane_b32 s7, v6 -; SI-NEXT: v_readfirstlane_b32 s8, v7 -; SI-NEXT: v_readfirstlane_b32 s6, v8 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v9 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_writelane_b32 v24, s34, 2 +; SI-NEXT: v_readfirstlane_b32 s23, v16 +; SI-NEXT: v_readfirstlane_b32 s20, v17 +; SI-NEXT: v_readfirstlane_b32 s21, v18 +; SI-NEXT: v_readfirstlane_b32 s18, v19 +; SI-NEXT: v_readfirstlane_b32 s19, v11 +; SI-NEXT: v_readfirstlane_b32 s16, v12 +; SI-NEXT: v_readfirstlane_b32 s17, v13 +; SI-NEXT: v_readfirstlane_b32 s14, v14 +; SI-NEXT: v_readfirstlane_b32 s15, v15 +; SI-NEXT: v_readfirstlane_b32 s12, v0 +; SI-NEXT: v_readfirstlane_b32 s13, v1 +; SI-NEXT: v_readfirstlane_b32 s10, v2 +; SI-NEXT: v_readfirstlane_b32 s11, v3 +; SI-NEXT: v_readfirstlane_b32 s8, v4 +; SI-NEXT: v_readfirstlane_b32 s9, v5 +; SI-NEXT: v_readfirstlane_b32 s6, v6 +; SI-NEXT: v_readfirstlane_b32 s7, v7 +; SI-NEXT: v_readfirstlane_b32 s4, v8 +; SI-NEXT: s_and_b64 s[26:27], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v9 +; SI-NEXT: v_writelane_b32 v24, s35, 3 ; SI-NEXT: s_cbranch_scc0 .LBB45_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 -; SI-NEXT: s_lshr_b32 s4, s40, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s24 +; SI-NEXT: s_lshr_b32 s88, s5, 16 +; SI-NEXT: s_lshr_b32 s89, s7, 16 +; SI-NEXT: s_lshr_b32 s90, s9, 16 +; SI-NEXT: s_lshr_b32 s91, s11, 16 +; SI-NEXT: s_lshr_b32 s92, s13, 16 +; SI-NEXT: s_lshr_b32 s93, s15, 16 +; SI-NEXT: s_lshr_b32 s94, s17, 16 +; SI-NEXT: s_lshr_b32 s95, s19, 16 +; SI-NEXT: s_lshr_b32 s30, s21, 16 +; SI-NEXT: s_lshr_b32 s31, s23, 16 +; SI-NEXT: s_lshr_b32 s34, s25, 16 +; SI-NEXT: s_lshr_b32 s35, s41, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[40:41], 16 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s24, 3 -; SI-NEXT: s_addc_u32 s5, s40, 0 -; SI-NEXT: s_lshr_b32 s24, s4, 16 -; SI-NEXT: s_lshr_b32 s29, s5, 16 -; SI-NEXT: s_add_u32 s25, s25, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 -; SI-NEXT: s_lshr_b32 s40, s25, 16 -; SI-NEXT: s_lshr_b32 s41, s27, 16 -; SI-NEXT: s_add_u32 s26, s26, 3 -; SI-NEXT: s_addc_u32 s28, s28, 0 -; SI-NEXT: s_lshr_b32 s42, s26, 16 -; SI-NEXT: s_lshr_b32 s43, s28, 16 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_lshr_b32 s44, s22, 16 -; SI-NEXT: s_lshr_b32 s45, s23, 16 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_lshr_b32 s46, s20, 16 -; SI-NEXT: s_lshr_b32 s47, s21, 16 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_lshr_b32 s56, s18, 16 -; SI-NEXT: s_lshr_b32 s57, s19, 16 -; SI-NEXT: s_add_u32 s16, s16, 3 -; SI-NEXT: s_addc_u32 s17, s17, 0 -; SI-NEXT: s_lshr_b32 s58, s16, 16 -; SI-NEXT: s_lshr_b32 s59, s17, 16 -; SI-NEXT: s_add_u32 s14, s14, 3 -; SI-NEXT: s_addc_u32 s15, s15, 0 -; SI-NEXT: s_lshr_b32 s60, s14, 16 -; SI-NEXT: s_lshr_b32 s61, s15, 16 -; SI-NEXT: s_add_u32 s12, s12, 3 -; SI-NEXT: s_addc_u32 s13, s13, 0 -; SI-NEXT: s_lshr_b32 s62, s12, 16 -; SI-NEXT: s_lshr_b32 s63, s13, 16 -; SI-NEXT: s_add_u32 s10, s10, 3 -; SI-NEXT: s_addc_u32 s11, s11, 0 -; SI-NEXT: s_lshr_b32 s72, s10, 16 -; SI-NEXT: s_lshr_b32 s73, s11, 16 -; SI-NEXT: s_add_u32 s7, s7, 3 -; SI-NEXT: s_addc_u32 s8, s8, 0 -; SI-NEXT: s_lshr_b32 s74, s7, 16 -; SI-NEXT: s_lshr_b32 s75, s8, 16 +; SI-NEXT: s_add_u32 s4, s4, 3 +; SI-NEXT: s_addc_u32 s5, s5, 0 ; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s7, s7, 0 +; SI-NEXT: s_add_u32 s8, s8, 3 ; SI-NEXT: s_addc_u32 s9, s9, 0 -; SI-NEXT: s_lshr_b32 s76, s6, 16 -; SI-NEXT: s_lshr_b32 s77, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s77 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s76 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s75 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s74 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s73 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s72 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s63 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s62 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s24 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s40, s40, 3 +; SI-NEXT: s_addc_u32 s41, s41, 0 +; SI-NEXT: s_lshr_b32 s88, s5, 16 +; SI-NEXT: s_lshr_b32 s89, s7, 16 +; SI-NEXT: s_lshr_b32 s90, s9, 16 +; SI-NEXT: s_lshr_b32 s91, s11, 16 +; SI-NEXT: s_lshr_b32 s92, s13, 16 +; SI-NEXT: s_lshr_b32 s93, s15, 16 +; SI-NEXT: s_lshr_b32 s94, s17, 16 +; SI-NEXT: s_lshr_b32 s95, s19, 16 +; SI-NEXT: s_lshr_b32 s30, s21, 16 +; SI-NEXT: s_lshr_b32 s31, s23, 16 +; SI-NEXT: s_lshr_b32 s34, s25, 16 +; SI-NEXT: s_lshr_b32 s35, s41, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[40:41], 16 ; SI-NEXT: .LBB45_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v40, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v40, v1 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 -; SI-NEXT: v_or_b32_e32 v2, v52, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v52 -; SI-NEXT: v_or_b32_e32 v5, v5, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v50 -; SI-NEXT: v_or_b32_e32 v7, v7, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v48 -; SI-NEXT: v_or_b32_e32 v9, v38, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38 -; SI-NEXT: v_or_b32_e32 v11, v36, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v36 -; SI-NEXT: v_or_b32_e32 v13, v34, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 -; SI-NEXT: v_or_b32_e32 v15, v32, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v32 -; SI-NEXT: v_or_b32_e32 v17, v30, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v30 -; SI-NEXT: v_or_b32_e32 v19, v28, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v28 -; SI-NEXT: v_or_b32_e32 v21, v26, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v26 -; SI-NEXT: v_or_b32_e32 v3, v54, v3 -; SI-NEXT: v_or_b32_e32 v4, v51, v4 -; SI-NEXT: v_or_b32_e32 v6, v49, v6 -; SI-NEXT: v_or_b32_e32 v8, v39, v8 -; SI-NEXT: v_or_b32_e32 v10, v37, v10 -; SI-NEXT: v_or_b32_e32 v12, v35, v12 -; SI-NEXT: v_or_b32_e32 v14, v33, v14 -; SI-NEXT: v_or_b32_e32 v16, v31, v16 -; SI-NEXT: v_or_b32_e32 v18, v29, v18 -; SI-NEXT: v_or_b32_e32 v20, v27, v20 -; SI-NEXT: v_or_b32_e32 v22, v25, v22 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: s_lshl_b32 s27, s76, 16 +; SI-NEXT: s_and_b32 s29, s40, 0xffff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s29, s41, 0xffff +; SI-NEXT: s_lshl_b32 s40, s35, 16 +; SI-NEXT: s_or_b32 s29, s29, s40 +; SI-NEXT: s_lshl_b32 s40, s74, 16 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_or_b32 s24, s24, s40 +; SI-NEXT: s_and_b32 s25, s25, 0xffff +; SI-NEXT: s_lshl_b32 s40, s34, 16 +; SI-NEXT: s_or_b32 s25, s25, s40 +; SI-NEXT: s_lshl_b32 s40, s72, 16 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_or_b32 s22, s22, s40 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s40, s31, 16 +; SI-NEXT: s_or_b32 s23, s23, s40 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_lshl_b32 s40, s62, 16 +; SI-NEXT: s_or_b32 s20, s20, s40 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s40, s30, 16 +; SI-NEXT: s_or_b32 s21, s21, s40 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s40, s60, 16 +; SI-NEXT: s_or_b32 s18, s18, s40 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s40, s95, 16 +; SI-NEXT: s_or_b32 s19, s19, s40 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s40, s58, 16 +; SI-NEXT: s_or_b32 s16, s16, s40 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s40, s94, 16 +; SI-NEXT: s_or_b32 s17, s17, s40 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s40, s56, 16 +; SI-NEXT: s_or_b32 s14, s14, s40 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s40, s93, 16 +; SI-NEXT: s_or_b32 s15, s15, s40 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s40, s46, 16 +; SI-NEXT: s_or_b32 s12, s12, s40 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s40, s92, 16 +; SI-NEXT: s_or_b32 s13, s13, s40 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s40, s44, 16 +; SI-NEXT: s_or_b32 s10, s10, s40 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s40, s91, 16 +; SI-NEXT: s_or_b32 s11, s11, s40 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s40, s42, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s28, s28, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s40 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s40, s90, 16 +; SI-NEXT: s_or_b32 s6, s6, s28 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s28, s89, 16 +; SI-NEXT: s_or_b32 s4, s4, s26 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s26, s88, 16 +; SI-NEXT: s_or_b32 s9, s9, s40 +; SI-NEXT: s_or_b32 s7, s7, s28 +; SI-NEXT: s_or_b32 s5, s5, s26 +; SI-NEXT: v_mov_b32_e32 v0, s27 +; SI-NEXT: v_mov_b32_e32 v1, s29 +; SI-NEXT: v_mov_b32_e32 v2, s24 +; SI-NEXT: v_mov_b32_e32 v3, s25 +; SI-NEXT: v_mov_b32_e32 v4, s22 +; SI-NEXT: v_mov_b32_e32 v5, s23 +; SI-NEXT: v_mov_b32_e32 v6, s20 +; SI-NEXT: v_mov_b32_e32 v7, s21 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v9, s19 +; SI-NEXT: v_mov_b32_e32 v10, s16 +; SI-NEXT: v_mov_b32_e32 v11, s17 +; SI-NEXT: v_mov_b32_e32 v12, s14 +; SI-NEXT: v_mov_b32_e32 v13, s15 +; SI-NEXT: v_mov_b32_e32 v14, s12 +; SI-NEXT: v_mov_b32_e32 v15, s13 +; SI-NEXT: v_mov_b32_e32 v16, s10 +; SI-NEXT: v_mov_b32_e32 v17, s11 +; SI-NEXT: v_mov_b32_e32 v18, s8 +; SI-NEXT: v_mov_b32_e32 v19, s9 +; SI-NEXT: v_mov_b32_e32 v20, s6 +; SI-NEXT: v_mov_b32_e32 v21, s7 +; SI-NEXT: v_mov_b32_e32 v22, s4 +; SI-NEXT: v_mov_b32_e32 v23, s5 +; SI-NEXT: v_readlane_b32 s35, v24, 3 +; SI-NEXT: v_readlane_b32 s34, v24, 2 +; SI-NEXT: v_readlane_b32 s31, v24, 1 +; SI-NEXT: v_readlane_b32 s30, v24, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr88 ; SI-NEXT: s_branch .LBB45_2 ; ; VI-LABEL: bitcast_v12i64_to_v48f16_scalar: @@ -25847,7 +24216,6 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v48f16_to_v12i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -25864,164 +24232,220 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v47, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_mov_b32_e32 v36, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_mov_b32_e32 v38, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_mov_b32_e32 v48, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_mov_b32_e32 v49, v14 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_mov_b32_e32 v50, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_mov_b32_e32 v51, v12 +; SI-NEXT: v_mov_b32_e32 v52, v11 +; SI-NEXT: v_mov_b32_e32 v53, v10 +; SI-NEXT: v_mov_b32_e32 v54, v9 +; SI-NEXT: v_mov_b32_e32 v55, v8 +; SI-NEXT: v_mov_b32_e32 v40, v7 +; SI-NEXT: v_mov_b32_e32 v41, v6 +; SI-NEXT: v_mov_b32_e32 v42, v5 +; SI-NEXT: v_mov_b32_e32 v43, v4 +; SI-NEXT: v_mov_b32_e32 v44, v3 +; SI-NEXT: v_mov_b32_e32 v45, v2 +; SI-NEXT: v_mov_b32_e32 v46, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v23 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v47 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v41 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v58 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v57 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v56 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v63 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v62 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v60 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v51 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v49 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v38 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v36 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 @@ -26049,124 +24473,20 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v43 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 -; SI-NEXT: v_or_b32_e32 v1, v52, v1 -; SI-NEXT: v_or_b32_e32 v2, v50, v2 -; SI-NEXT: v_or_b32_e32 v3, v48, v3 -; SI-NEXT: v_or_b32_e32 v4, v38, v4 -; SI-NEXT: v_or_b32_e32 v5, v36, v5 -; SI-NEXT: v_or_b32_e32 v6, v34, v6 -; SI-NEXT: v_or_b32_e32 v7, v32, v7 -; SI-NEXT: v_or_b32_e32 v8, v62, v8 -; SI-NEXT: v_or_b32_e32 v9, v60, v9 -; SI-NEXT: v_or_b32_e32 v10, v58, v10 -; SI-NEXT: v_or_b32_e32 v11, v56, v11 -; SI-NEXT: v_or_b32_e32 v12, v46, v12 -; SI-NEXT: v_or_b32_e32 v13, v44, v13 -; SI-NEXT: v_or_b32_e32 v14, v42, v14 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v40, v23 -; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: .LBB46_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -26179,10 +24499,10 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v43 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -26191,194 +24511,198 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v6, v42 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v41 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v40 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v55 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v58 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v57 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v54 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v53 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v43 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v42 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v48 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v38 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v9, v62 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v37 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v59 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v61 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v41 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v39 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v17, v19, v17 ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v36 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 @@ -27051,536 +25375,341 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i ; SI-LABEL: bitcast_v48f16_to_v12i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: s_lshr_b32 s40, s17, 16 -; SI-NEXT: s_lshr_b32 s41, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: s_lshr_b32 s15, s18, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v23 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v8 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: v_mov_b32_e32 v32, v9 +; SI-NEXT: v_mov_b32_e32 v33, v8 +; SI-NEXT: v_mov_b32_e32 v34, v7 +; SI-NEXT: v_mov_b32_e32 v35, v6 +; SI-NEXT: v_mov_b32_e32 v36, v5 +; SI-NEXT: v_mov_b32_e32 v37, v4 +; SI-NEXT: v_mov_b32_e32 v38, v3 +; SI-NEXT: v_mov_b32_e32 v39, v2 +; SI-NEXT: v_mov_b32_e32 v48, v1 +; SI-NEXT: v_mov_b32_e32 v49, v0 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v37 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v38 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v39 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v48 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v49 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v46 -; SI-NEXT: v_or_b32_e32 v3, v25, v3 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 -; SI-NEXT: v_or_b32_e32 v1, v63, v1 -; SI-NEXT: v_or_b32_e32 v2, v47, v2 -; SI-NEXT: v_or_b32_e32 v4, v62, v4 -; SI-NEXT: v_or_b32_e32 v5, v59, v5 -; SI-NEXT: v_or_b32_e32 v6, v58, v6 -; SI-NEXT: v_or_b32_e32 v7, v54, v7 -; SI-NEXT: v_or_b32_e32 v8, v53, v8 -; SI-NEXT: v_or_b32_e32 v9, v50, v9 -; SI-NEXT: v_or_b32_e32 v10, v49, v10 -; SI-NEXT: v_or_b32_e32 v11, v38, v11 -; SI-NEXT: v_or_b32_e32 v12, v37, v12 -; SI-NEXT: v_or_b32_e32 v13, v35, v13 -; SI-NEXT: v_or_b32_e32 v14, v33, v14 -; SI-NEXT: v_or_b32_e32 v15, v26, v15 -; SI-NEXT: v_or_b32_e32 v16, v31, v16 -; SI-NEXT: v_or_b32_e32 v17, v29, v17 -; SI-NEXT: v_or_b32_e32 v18, v27, v18 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s41 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s40 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v54 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v48 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v26 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v30 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v29 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v27 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s15 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s14 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s22 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v57 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s11 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v51 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v39 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s28 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v35 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v43 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v48 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v41 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v40 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v38 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v37 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v53 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v35 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v34 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v51 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v32 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 ; SI-NEXT: .LBB47_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v55, v39 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v53, v37 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v52, v36 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v51, v35 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v50, v34 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v49, v33 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v48, v32 -; SI-NEXT: v_mov_b32_e32 v39, v26 -; SI-NEXT: v_mov_b32_e32 v37, v58 -; SI-NEXT: v_mov_b32_e32 v58, v27 -; SI-NEXT: v_mov_b32_e32 v36, v59 -; SI-NEXT: v_mov_b32_e32 v59, v28 -; SI-NEXT: v_mov_b32_e32 v35, v60 -; SI-NEXT: v_mov_b32_e32 v60, v29 -; SI-NEXT: v_mov_b32_e32 v34, v61 -; SI-NEXT: v_mov_b32_e32 v61, v30 -; SI-NEXT: v_mov_b32_e32 v33, v62 -; SI-NEXT: v_mov_b32_e32 v62, v31 -; SI-NEXT: v_mov_b32_e32 v32, v63 -; SI-NEXT: v_mov_b32_e32 v63, v24 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v24, v63 -; SI-NEXT: v_mov_b32_e32 v63, v32 -; SI-NEXT: v_mov_b32_e32 v31, v62 -; SI-NEXT: v_mov_b32_e32 v62, v33 -; SI-NEXT: v_mov_b32_e32 v30, v61 -; SI-NEXT: v_mov_b32_e32 v61, v34 -; SI-NEXT: v_mov_b32_e32 v29, v60 -; SI-NEXT: v_mov_b32_e32 v60, v35 -; SI-NEXT: v_mov_b32_e32 v28, v59 -; SI-NEXT: v_mov_b32_e32 v59, v36 -; SI-NEXT: v_mov_b32_e32 v27, v58 -; SI-NEXT: v_mov_b32_e32 v58, v37 -; SI-NEXT: v_mov_b32_e32 v26, v39 -; SI-NEXT: v_mov_b32_e32 v32, v48 -; SI-NEXT: v_mov_b32_e32 v33, v49 -; SI-NEXT: v_mov_b32_e32 v34, v50 -; SI-NEXT: v_mov_b32_e32 v35, v51 -; SI-NEXT: v_mov_b32_e32 v36, v52 -; SI-NEXT: v_mov_b32_e32 v37, v53 -; SI-NEXT: v_mov_b32_e32 v39, v55 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: v_mov_b32_e32 v41, v40 ; SI-NEXT: s_branch .LBB47_2 ; ; VI-LABEL: bitcast_v48f16_to_v12i64_scalar: @@ -31613,189 +29742,65 @@ end: define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v12f64_to_v48f16: ; SI: ; %bb.0: -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v22 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v24 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v50, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_alignbit_b32 v24, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v25, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v26, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v27, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v28, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v29, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v30, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v31, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v33, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v36, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v39, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v50, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 ; SI-NEXT: .LBB52_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 ; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 @@ -31803,219 +29808,108 @@ define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_alignbit_b32 v24, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v25, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v26, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v27, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v28, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v29, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v30, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v31, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v33, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v36, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v39, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v50, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v17 ; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_mov_b32_e32 v33, v22 -; SI-NEXT: v_mov_b32_e32 v32, v23 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 ; SI-NEXT: .LBB52_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v28 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v24 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v62 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v61 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v58 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v57 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v45 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v46 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v41 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v42 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v53 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v54 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v48 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v50 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v16, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v34 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v37 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v32 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v35 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v39 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v33 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v50 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v55 +; SI-NEXT: v_or_b32_e32 v2, v2, v39 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v54 +; SI-NEXT: v_or_b32_e32 v4, v4, v36 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v53 +; SI-NEXT: v_or_b32_e32 v6, v6, v33 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v52 +; SI-NEXT: v_or_b32_e32 v8, v8, v31 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v51 +; SI-NEXT: v_or_b32_e32 v10, v10, v30 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v49 +; SI-NEXT: v_or_b32_e32 v12, v12, v29 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_or_b32_e32 v14, v14, v28 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v38 +; SI-NEXT: v_or_b32_e32 v16, v16, v27 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 +; SI-NEXT: v_or_b32_e32 v18, v18, v26 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 +; SI-NEXT: v_or_b32_e32 v20, v20, v25 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v34 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v50 +; SI-NEXT: v_or_b32_e32 v3, v3, v39 +; SI-NEXT: v_or_b32_e32 v5, v5, v36 +; SI-NEXT: v_or_b32_e32 v7, v7, v33 +; SI-NEXT: v_or_b32_e32 v9, v9, v31 +; SI-NEXT: v_or_b32_e32 v11, v11, v30 +; SI-NEXT: v_or_b32_e32 v13, v13, v29 +; SI-NEXT: v_or_b32_e32 v15, v15, v28 +; SI-NEXT: v_or_b32_e32 v17, v17, v27 +; SI-NEXT: v_or_b32_e32 v19, v19, v26 +; SI-NEXT: v_or_b32_e32 v21, v21, v25 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f64_to_v48f16: @@ -32474,404 +30368,206 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; SI-NEXT: v_mov_b32_e32 v15, s23 ; SI-NEXT: v_mov_b32_e32 v16, s24 ; SI-NEXT: v_mov_b32_e32 v17, s25 -; SI-NEXT: v_mov_b32_e32 v12, s26 -; SI-NEXT: v_mov_b32_e32 v13, s27 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v10, s28 -; SI-NEXT: v_mov_b32_e32 v11, s29 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v7 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v9 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v18 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v8 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v50, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v14 +; SI-NEXT: v_lshr_b64 v[34:35], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[14:15], 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v11 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v15 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v22 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshr_b64 v[38:39], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[22:23], 16 ; SI-NEXT: s_cbranch_execnz .LBB53_3 ; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 -; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_lshr_b64 v[34:35], v[8:9], 16 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_lshr_b64 v[35:36], v[6:7], 16 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v23 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_lshr_b64 v[36:37], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[22:23], 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v11 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v16 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v62, v15 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v63, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v15 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_mov_b32_e32 v33, v8 -; SI-NEXT: v_mov_b32_e32 v32, v9 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 ; SI-NEXT: .LBB53_3: ; %end -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v30 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v28 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v57 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v25 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v45 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v62 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v43 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v58 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v41 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v46 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v53 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v54 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v48 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v50 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v16, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v34 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v37 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v32 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v35 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v30 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v30, v22, v25 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v32 +; SI-NEXT: v_or_b32_e32 v31, v22, v23 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v27 +; SI-NEXT: v_or_b32_e32 v32, v20, v22 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v33 +; SI-NEXT: v_or_b32_e32 v33, v20, v21 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v18, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v44 +; SI-NEXT: v_or_b32_e32 v25, v18, v19 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v14, v18 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v43 +; SI-NEXT: v_or_b32_e32 v27, v14, v15 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v29 +; SI-NEXT: v_or_b32_e32 v28, v14, v15 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v42 +; SI-NEXT: v_or_b32_e32 v29, v14, v15 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v49 +; SI-NEXT: v_or_b32_e32 v10, v10, v14 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v48 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 +; SI-NEXT: v_or_b32_e32 v14, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v35 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v34 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v30 +; SI-NEXT: v_mov_b32_e32 v1, v31 +; SI-NEXT: v_mov_b32_e32 v2, v32 +; SI-NEXT: v_mov_b32_e32 v3, v33 +; SI-NEXT: v_mov_b32_e32 v4, v24 +; SI-NEXT: v_mov_b32_e32 v5, v25 +; SI-NEXT: v_mov_b32_e32 v6, v26 +; SI-NEXT: v_mov_b32_e32 v7, v27 +; SI-NEXT: v_mov_b32_e32 v8, v28 +; SI-NEXT: v_mov_b32_e32 v9, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v39 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v33 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: s_branch .LBB53_2 ; ; VI-LABEL: bitcast_v12f64_to_v48f16_scalar: @@ -33546,7 +31242,6 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v48f16_to_v12f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -33563,164 +31258,220 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v47, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_mov_b32_e32 v36, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_mov_b32_e32 v38, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_mov_b32_e32 v39, v16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_mov_b32_e32 v48, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_mov_b32_e32 v49, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_mov_b32_e32 v50, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_mov_b32_e32 v51, v12 +; SI-NEXT: v_mov_b32_e32 v52, v11 +; SI-NEXT: v_mov_b32_e32 v53, v10 +; SI-NEXT: v_mov_b32_e32 v54, v9 +; SI-NEXT: v_mov_b32_e32 v55, v8 +; SI-NEXT: v_mov_b32_e32 v40, v7 +; SI-NEXT: v_mov_b32_e32 v41, v6 +; SI-NEXT: v_mov_b32_e32 v42, v5 +; SI-NEXT: v_mov_b32_e32 v43, v4 +; SI-NEXT: v_mov_b32_e32 v44, v3 +; SI-NEXT: v_mov_b32_e32 v45, v2 +; SI-NEXT: v_mov_b32_e32 v46, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v23 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v47 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v41 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v58 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v57 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v56 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v63 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v62 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v60 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v51 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v49 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v38 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v36 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 @@ -33748,124 +31499,20 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v43 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 -; SI-NEXT: v_or_b32_e32 v1, v52, v1 -; SI-NEXT: v_or_b32_e32 v2, v50, v2 -; SI-NEXT: v_or_b32_e32 v3, v48, v3 -; SI-NEXT: v_or_b32_e32 v4, v38, v4 -; SI-NEXT: v_or_b32_e32 v5, v36, v5 -; SI-NEXT: v_or_b32_e32 v6, v34, v6 -; SI-NEXT: v_or_b32_e32 v7, v32, v7 -; SI-NEXT: v_or_b32_e32 v8, v62, v8 -; SI-NEXT: v_or_b32_e32 v9, v60, v9 -; SI-NEXT: v_or_b32_e32 v10, v58, v10 -; SI-NEXT: v_or_b32_e32 v11, v56, v11 -; SI-NEXT: v_or_b32_e32 v12, v46, v12 -; SI-NEXT: v_or_b32_e32 v13, v44, v13 -; SI-NEXT: v_or_b32_e32 v14, v42, v14 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v40, v23 -; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: .LBB54_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -33878,10 +31525,10 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v43 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -33890,194 +31537,198 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v6, v42 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v41 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v40 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v55 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v58 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v57 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v54 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v53 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v43 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v42 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v48 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v38 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v9, v62 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v37 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v59 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v61 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v41 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v39 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v17, v19, v17 ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v36 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 @@ -34750,536 +32401,341 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a ; SI-LABEL: bitcast_v48f16_to_v12f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: s_lshr_b32 s40, s17, 16 -; SI-NEXT: s_lshr_b32 s41, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: s_lshr_b32 s15, s18, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v23 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v8 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: v_mov_b32_e32 v32, v9 +; SI-NEXT: v_mov_b32_e32 v33, v8 +; SI-NEXT: v_mov_b32_e32 v34, v7 +; SI-NEXT: v_mov_b32_e32 v35, v6 +; SI-NEXT: v_mov_b32_e32 v36, v5 +; SI-NEXT: v_mov_b32_e32 v37, v4 +; SI-NEXT: v_mov_b32_e32 v38, v3 +; SI-NEXT: v_mov_b32_e32 v39, v2 +; SI-NEXT: v_mov_b32_e32 v48, v1 +; SI-NEXT: v_mov_b32_e32 v49, v0 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v37 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v38 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v39 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v48 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v49 ; SI-NEXT: s_cbranch_scc0 .LBB55_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v46 -; SI-NEXT: v_or_b32_e32 v3, v25, v3 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 -; SI-NEXT: v_or_b32_e32 v1, v63, v1 -; SI-NEXT: v_or_b32_e32 v2, v47, v2 -; SI-NEXT: v_or_b32_e32 v4, v62, v4 -; SI-NEXT: v_or_b32_e32 v5, v59, v5 -; SI-NEXT: v_or_b32_e32 v6, v58, v6 -; SI-NEXT: v_or_b32_e32 v7, v54, v7 -; SI-NEXT: v_or_b32_e32 v8, v53, v8 -; SI-NEXT: v_or_b32_e32 v9, v50, v9 -; SI-NEXT: v_or_b32_e32 v10, v49, v10 -; SI-NEXT: v_or_b32_e32 v11, v38, v11 -; SI-NEXT: v_or_b32_e32 v12, v37, v12 -; SI-NEXT: v_or_b32_e32 v13, v35, v13 -; SI-NEXT: v_or_b32_e32 v14, v33, v14 -; SI-NEXT: v_or_b32_e32 v15, v26, v15 -; SI-NEXT: v_or_b32_e32 v16, v31, v16 -; SI-NEXT: v_or_b32_e32 v17, v29, v17 -; SI-NEXT: v_or_b32_e32 v18, v27, v18 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB55_3 ; SI-NEXT: .LBB55_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v54 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v48 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v26 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v30 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v29 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v27 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s41 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s40 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s15 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s14 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s22 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v57 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s11 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v51 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v39 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s28 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v35 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v43 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v48 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v41 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v40 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v38 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v37 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v53 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v35 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v34 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v51 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v32 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 ; SI-NEXT: .LBB55_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB55_4: -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v55, v39 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v53, v37 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v52, v36 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v51, v35 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v50, v34 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v49, v33 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v48, v32 -; SI-NEXT: v_mov_b32_e32 v39, v26 -; SI-NEXT: v_mov_b32_e32 v37, v58 -; SI-NEXT: v_mov_b32_e32 v58, v27 -; SI-NEXT: v_mov_b32_e32 v36, v59 -; SI-NEXT: v_mov_b32_e32 v59, v28 -; SI-NEXT: v_mov_b32_e32 v35, v60 -; SI-NEXT: v_mov_b32_e32 v60, v29 -; SI-NEXT: v_mov_b32_e32 v34, v61 -; SI-NEXT: v_mov_b32_e32 v61, v30 -; SI-NEXT: v_mov_b32_e32 v33, v62 -; SI-NEXT: v_mov_b32_e32 v62, v31 -; SI-NEXT: v_mov_b32_e32 v32, v63 -; SI-NEXT: v_mov_b32_e32 v63, v24 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v24, v63 -; SI-NEXT: v_mov_b32_e32 v63, v32 -; SI-NEXT: v_mov_b32_e32 v31, v62 -; SI-NEXT: v_mov_b32_e32 v62, v33 -; SI-NEXT: v_mov_b32_e32 v30, v61 -; SI-NEXT: v_mov_b32_e32 v61, v34 -; SI-NEXT: v_mov_b32_e32 v29, v60 -; SI-NEXT: v_mov_b32_e32 v60, v35 -; SI-NEXT: v_mov_b32_e32 v28, v59 -; SI-NEXT: v_mov_b32_e32 v59, v36 -; SI-NEXT: v_mov_b32_e32 v27, v58 -; SI-NEXT: v_mov_b32_e32 v58, v37 -; SI-NEXT: v_mov_b32_e32 v26, v39 -; SI-NEXT: v_mov_b32_e32 v32, v48 -; SI-NEXT: v_mov_b32_e32 v33, v49 -; SI-NEXT: v_mov_b32_e32 v34, v50 -; SI-NEXT: v_mov_b32_e32 v35, v51 -; SI-NEXT: v_mov_b32_e32 v36, v52 -; SI-NEXT: v_mov_b32_e32 v37, v53 -; SI-NEXT: v_mov_b32_e32 v39, v55 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: v_mov_b32_e32 v41, v40 ; SI-NEXT: s_branch .LBB55_2 ; ; VI-LABEL: bitcast_v48f16_to_v12f64_scalar: @@ -35874,7 +33330,30 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v48i16_to_v48f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v34 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v32 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 @@ -35918,19 +33397,6 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -35947,540 +33413,490 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v0 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v0 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB56_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v58, v51 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v54 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v52 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v61, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v47 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v48 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v31 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v53 +; SI-NEXT: v_or_b32_e32 v46, v1, v24 +; SI-NEXT: v_alignbit_b32 v1, v46, v58, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_or_b32_e32 v44, v1, v3 +; SI-NEXT: v_alignbit_b32 v1, v44, v60, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v39 +; SI-NEXT: v_or_b32_e32 v43, v1, v3 +; SI-NEXT: v_alignbit_b32 v1, v43, v61, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: v_or_b32_e32 v41, v1, v3 +; SI-NEXT: v_alignbit_b32 v1, v41, v36, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_or_b32_e32 v55, v1, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_alignbit_b32 v1, v55, v51, 16 +; SI-NEXT: v_or_b32_e32 v0, v0, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v52, v1, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v60 +; SI-NEXT: v_alignbit_b32 v1, v52, v40, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v0, v0, v61 +; SI-NEXT: v_or_b32_e32 v49, v1, v62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_alignbit_b32 v1, v49, v45, 16 +; SI-NEXT: v_or_b32_e32 v0, v0, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v38, v1, v57 +; SI-NEXT: v_or_b32_e32 v0, v0, v51 +; SI-NEXT: v_alignbit_b32 v1, v38, v59, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v0, v0, v40 +; SI-NEXT: v_or_b32_e32 v35, v1, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: v_alignbit_b32 v1, v35, v63, 16 +; SI-NEXT: v_or_b32_e32 v0, v0, v45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v33, v1, v47 +; SI-NEXT: v_or_b32_e32 v0, v0, v59 +; SI-NEXT: v_alignbit_b32 v1, v33, v48, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v0, v0, v63 +; SI-NEXT: v_or_b32_e32 v31, v1, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_or_b32_e32 v0, v0, v48 +; SI-NEXT: ; kill: killed $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_alignbit_b32 v1, v31, v42, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: ; kill: killed $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v0, v0, v42 +; SI-NEXT: ; kill: killed $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_or_b32_e32 v24, v1, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: ; kill: killed $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_alignbit_b32 v1, v24, v56, 16 +; SI-NEXT: v_or_b32_e32 v0, v0, v56 +; SI-NEXT: ; kill: killed $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: .LBB56_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 -; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 -; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 -; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v47 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v46 ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v45 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v56, v22 +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v23 ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v43 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v22, v26, v22 +; SI-NEXT: v_or_b32_e32 v20, v42, v20 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v20, v25, v20 +; SI-NEXT: v_or_b32_e32 v18, v48, v18 +; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v18, v47, v18 +; SI-NEXT: v_or_b32_e32 v16, v63, v16 +; SI-NEXT: v_add_i32_e32 v33, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v16, v54, v16 +; SI-NEXT: v_or_b32_e32 v14, v59, v14 +; SI-NEXT: v_add_i32_e32 v35, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v14, v57, v14 +; SI-NEXT: v_or_b32_e32 v12, v45, v12 +; SI-NEXT: v_add_i32_e32 v38, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v12, v62, v12 +; SI-NEXT: v_or_b32_e32 v10, v40, v10 +; SI-NEXT: v_add_i32_e32 v49, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v51, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v36, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v61, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v60, v2 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v42 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_add_i32_e32 v52, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_add_i32_e32 v55, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v55 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_add_i32_e32 v41, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v41 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v43, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v43 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v44, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v44 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v46, vcc, s6, v0 +; SI-NEXT: v_alignbit_b32 v0, v46, v2, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 -; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v40 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v53 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v44, v4, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v43, v6, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v41, v8, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v55, v10, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v52, v12, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: v_add_i32_e32 v51, vcc, 3, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v49, v14, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v38, v16, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v35, v18, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v33, v20, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v31, v22, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v24, v27, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: .LBB56_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v31 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v59 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v16, v62 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v18, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v63 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v57 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -36497,12 +33913,56 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v29 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v48i16_to_v48f16: @@ -36965,411 +34425,503 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i ; SI-LABEL: bitcast_v48i16_to_v48f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s43, s29, 16 -; SI-NEXT: s_lshr_b32 s42, s28, 16 -; SI-NEXT: s_lshr_b32 s41, s27, 16 -; SI-NEXT: s_lshr_b32 s40, s26, 16 -; SI-NEXT: s_lshr_b32 s15, s25, 16 -; SI-NEXT: s_lshr_b32 s14, s24, 16 -; SI-NEXT: s_lshr_b32 s13, s23, 16 -; SI-NEXT: s_lshr_b32 s12, s22, 16 -; SI-NEXT: s_lshr_b32 s11, s21, 16 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v24, s30, 0 +; SI-NEXT: v_writelane_b32 v24, s31, 1 +; SI-NEXT: v_writelane_b32 v24, s34, 2 +; SI-NEXT: v_writelane_b32 v24, s35, 3 +; SI-NEXT: v_writelane_b32 v24, s36, 4 +; SI-NEXT: v_writelane_b32 v24, s37, 5 +; SI-NEXT: v_writelane_b32 v24, s38, 6 +; SI-NEXT: v_writelane_b32 v24, s39, 7 +; SI-NEXT: v_writelane_b32 v24, s48, 8 +; SI-NEXT: v_writelane_b32 v24, s49, 9 +; SI-NEXT: v_writelane_b32 v24, s50, 10 +; SI-NEXT: v_writelane_b32 v24, s51, 11 +; SI-NEXT: v_writelane_b32 v24, s52, 12 +; SI-NEXT: v_writelane_b32 v24, s53, 13 +; SI-NEXT: v_writelane_b32 v24, s54, 14 +; SI-NEXT: v_writelane_b32 v24, s55, 15 +; SI-NEXT: v_writelane_b32 v24, s64, 16 +; SI-NEXT: v_writelane_b32 v24, s65, 17 +; SI-NEXT: v_writelane_b32 v24, s66, 18 +; SI-NEXT: v_writelane_b32 v24, s67, 19 +; SI-NEXT: v_writelane_b32 v24, s68, 20 +; SI-NEXT: v_writelane_b32 v24, s69, 21 +; SI-NEXT: v_writelane_b32 v24, s70, 22 +; SI-NEXT: v_writelane_b32 v24, s71, 23 +; SI-NEXT: v_writelane_b32 v24, s80, 24 +; SI-NEXT: v_writelane_b32 v24, s81, 25 +; SI-NEXT: v_writelane_b32 v24, s82, 26 +; SI-NEXT: v_writelane_b32 v24, s83, 27 +; SI-NEXT: v_writelane_b32 v24, s84, 28 +; SI-NEXT: v_writelane_b32 v24, s85, 29 +; SI-NEXT: v_writelane_b32 v24, s86, 30 +; SI-NEXT: v_writelane_b32 v24, s87, 31 +; SI-NEXT: v_writelane_b32 v24, s96, 32 +; SI-NEXT: v_writelane_b32 v24, s97, 33 +; SI-NEXT: v_writelane_b32 v24, s98, 34 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; SI-NEXT: v_readfirstlane_b32 s85, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; SI-NEXT: s_lshr_b32 s50, s29, 16 +; SI-NEXT: s_lshr_b32 s70, s28, 16 +; SI-NEXT: s_lshr_b32 s49, s27, 16 +; SI-NEXT: s_lshr_b32 s69, s26, 16 +; SI-NEXT: s_lshr_b32 s48, s25, 16 +; SI-NEXT: s_lshr_b32 s68, s24, 16 +; SI-NEXT: s_lshr_b32 s39, s23, 16 +; SI-NEXT: s_lshr_b32 s67, s22, 16 +; SI-NEXT: s_lshr_b32 s38, s21, 16 +; SI-NEXT: s_lshr_b32 s66, s20, 16 +; SI-NEXT: s_lshr_b32 s37, s19, 16 +; SI-NEXT: s_lshr_b32 s63, s18, 16 +; SI-NEXT: s_lshr_b32 s36, s17, 16 +; SI-NEXT: s_lshr_b32 s61, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_writelane_b32 v24, s99, 35 +; SI-NEXT: v_readfirstlane_b32 s64, v8 +; SI-NEXT: v_readfirstlane_b32 s87, v7 +; SI-NEXT: v_readfirstlane_b32 s97, v6 +; SI-NEXT: v_readfirstlane_b32 s83, v5 +; SI-NEXT: v_readfirstlane_b32 s86, v4 +; SI-NEXT: v_readfirstlane_b32 s81, v3 +; SI-NEXT: v_readfirstlane_b32 s82, v2 +; SI-NEXT: v_readfirstlane_b32 s71, v1 +; SI-NEXT: v_readfirstlane_b32 s80, v0 +; SI-NEXT: v_readfirstlane_b32 s55, v11 +; SI-NEXT: v_readfirstlane_b32 s65, v12 +; SI-NEXT: v_readfirstlane_b32 s53, v13 +; SI-NEXT: v_readfirstlane_b32 s99, v14 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; SI-NEXT: v_readfirstlane_b32 s54, v15 +; SI-NEXT: v_readfirstlane_b32 s98, v16 +; SI-NEXT: v_readfirstlane_b32 s52, v17 +; SI-NEXT: v_readfirstlane_b32 s96, v18 +; SI-NEXT: v_readfirstlane_b32 s51, v19 +; SI-NEXT: v_readfirstlane_b32 s84, v9 ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v10, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s10 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s13 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v17, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 -; SI-NEXT: v_mov_b32_e32 v15, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v1 -; SI-NEXT: v_mov_b32_e32 v62, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 -; SI-NEXT: v_mov_b32_e32 v63, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v3 -; SI-NEXT: v_mov_b32_e32 v58, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v4 -; SI-NEXT: v_mov_b32_e32 v59, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v5 -; SI-NEXT: v_mov_b32_e32 v60, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v6 -; SI-NEXT: v_mov_b32_e32 v61, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v7 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v10, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v8 -; SI-NEXT: v_mov_b32_e32 v12, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v9 -; SI-NEXT: v_mov_b32_e32 v14, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s36, 16 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s60, s61, 16 +; SI-NEXT: s_mov_b32 s9, s61 +; SI-NEXT: s_or_b32 s61, s5, s7 +; SI-NEXT: s_and_b32 s5, s19, 0xffff +; SI-NEXT: s_lshl_b32 s7, s37, 16 +; SI-NEXT: s_or_b32 s44, s4, s60 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s62, s63, 16 +; SI-NEXT: s_mov_b32 s11, s63 +; SI-NEXT: s_or_b32 s63, s5, s7 +; SI-NEXT: s_and_b32 s5, s21, 0xffff +; SI-NEXT: s_lshl_b32 s7, s38, 16 +; SI-NEXT: s_or_b32 s42, s4, s62 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s72, s66, 16 +; SI-NEXT: s_or_b32 s73, s5, s7 +; SI-NEXT: s_and_b32 s5, s23, 0xffff +; SI-NEXT: s_lshl_b32 s7, s39, 16 +; SI-NEXT: s_or_b32 s40, s4, s72 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s74, s67, 16 +; SI-NEXT: s_or_b32 s75, s5, s7 +; SI-NEXT: s_and_b32 s5, s25, 0xffff +; SI-NEXT: s_lshl_b32 s7, s48, 16 +; SI-NEXT: s_or_b32 s14, s4, s74 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s76, s68, 16 +; SI-NEXT: s_or_b32 s77, s5, s7 +; SI-NEXT: s_and_b32 s5, s27, 0xffff +; SI-NEXT: s_lshl_b32 s7, s49, 16 +; SI-NEXT: s_or_b32 s12, s4, s76 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s78, s69, 16 +; SI-NEXT: s_or_b32 s79, s5, s7 +; SI-NEXT: s_and_b32 s5, s29, 0xffff +; SI-NEXT: s_lshl_b32 s7, s50, 16 +; SI-NEXT: s_or_b32 s10, s4, s78 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s88, s70, 16 +; SI-NEXT: s_or_b32 s89, s5, s7 +; SI-NEXT: s_and_b32 s5, s71, 0xffff +; SI-NEXT: s_lshl_b32 s7, s51, 16 +; SI-NEXT: s_or_b32 s8, s4, s88 +; SI-NEXT: s_and_b32 s4, s80, 0xffff +; SI-NEXT: s_lshl_b32 s90, s84, 16 +; SI-NEXT: s_or_b32 s91, s5, s7 +; SI-NEXT: s_and_b32 s5, s81, 0xffff +; SI-NEXT: s_lshl_b32 s7, s52, 16 +; SI-NEXT: s_or_b32 s6, s4, s90 +; SI-NEXT: s_and_b32 s4, s82, 0xffff +; SI-NEXT: s_lshl_b32 s58, s96, 16 +; SI-NEXT: s_or_b32 s59, s5, s7 +; SI-NEXT: s_and_b32 s5, s83, 0xffff +; SI-NEXT: s_lshl_b32 s7, s54, 16 +; SI-NEXT: s_or_b32 s4, s4, s58 +; SI-NEXT: s_lshl_b32 s56, s98, 16 +; SI-NEXT: s_or_b32 s57, s5, s7 +; SI-NEXT: s_and_b32 s5, s87, 0xffff +; SI-NEXT: s_lshl_b32 s7, s53, 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[58:59], 16 +; SI-NEXT: s_and_b32 s58, s86, 0xffff +; SI-NEXT: s_lshl_b32 s46, s99, 16 +; SI-NEXT: s_or_b32 s47, s5, s7 +; SI-NEXT: s_and_b32 s5, s85, 0xffff +; SI-NEXT: s_lshl_b32 s7, s55, 16 +; SI-NEXT: s_or_b32 s58, s58, s56 +; SI-NEXT: s_lshr_b64 s[94:95], s[56:57], 16 +; SI-NEXT: s_and_b32 s56, s97, 0xffff +; SI-NEXT: s_or_b32 vcc_hi, s5, s7 +; SI-NEXT: s_lshl_b32 vcc_lo, s65, 16 +; SI-NEXT: s_mov_b32 s45, s61 +; SI-NEXT: s_lshr_b64 s[60:61], s[60:61], 16 +; SI-NEXT: s_mov_b32 s43, s63 +; SI-NEXT: s_lshr_b64 s[62:63], s[62:63], 16 +; SI-NEXT: s_or_b32 s56, s56, s46 +; SI-NEXT: s_lshr_b64 s[30:31], s[46:47], 16 +; SI-NEXT: s_and_b32 s46, s64, 0xffff +; SI-NEXT: s_mov_b32 s61, s9 +; SI-NEXT: s_mov_b32 s63, s11 +; SI-NEXT: s_mov_b32 s41, s73 +; SI-NEXT: s_lshr_b64 s[72:73], s[72:73], 16 +; SI-NEXT: s_mov_b32 s15, s75 +; SI-NEXT: s_lshr_b64 s[74:75], s[74:75], 16 +; SI-NEXT: s_mov_b32 s13, s77 +; SI-NEXT: s_lshr_b64 s[76:77], s[76:77], 16 +; SI-NEXT: s_mov_b32 s11, s79 +; SI-NEXT: s_lshr_b64 s[78:79], s[78:79], 16 +; SI-NEXT: s_mov_b32 s9, s89 +; SI-NEXT: s_lshr_b64 s[88:89], s[88:89], 16 +; SI-NEXT: s_mov_b32 s7, s91 +; SI-NEXT: s_lshr_b64 s[90:91], s[90:91], 16 +; SI-NEXT: s_mov_b32 s5, s59 +; SI-NEXT: s_mov_b32 s59, s57 +; SI-NEXT: s_mov_b32 s57, s47 +; SI-NEXT: s_or_b32 s46, s46, vcc_lo +; SI-NEXT: s_mov_b32 s47, vcc_hi +; SI-NEXT: s_lshr_b64 s[34:35], vcc, 16 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s16 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v14 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, s17 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v12 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, s18 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v61 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v41, vcc, 3, v60 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v59 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v58 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v63 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v62 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: s_add_i32 s43, s43, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s64, s64, 3 +; SI-NEXT: s_and_b32 s4, s64, 0xffff +; SI-NEXT: s_lshl_b32 s5, s65, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s85, s85, 3 +; SI-NEXT: s_add_i32 s46, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s85, 0xffff +; SI-NEXT: s_lshl_b32 s5, s55, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s97, s97, 3 +; SI-NEXT: s_add_i32 s47, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s97, 0xffff +; SI-NEXT: s_lshl_b32 s5, s99, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s87, s87, 3 +; SI-NEXT: s_add_i32 s56, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s87, 0xffff +; SI-NEXT: s_lshl_b32 s5, s53, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s86, s86, 3 +; SI-NEXT: s_add_i32 s57, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s86, 0xffff +; SI-NEXT: s_lshl_b32 s5, s98, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s83, s83, 3 +; SI-NEXT: s_add_i32 s58, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s83, 0xffff +; SI-NEXT: s_lshl_b32 s5, s54, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s82, s82, 3 +; SI-NEXT: s_add_i32 s59, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s82, 0xffff +; SI-NEXT: s_lshl_b32 s5, s96, 16 +; SI-NEXT: s_add_i32 s81, s81, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s81, 0xffff +; SI-NEXT: s_lshl_b32 s6, s52, 16 +; SI-NEXT: s_add_i32 s80, s80, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s80, 0xffff +; SI-NEXT: s_lshl_b32 s7, s84, 16 +; SI-NEXT: s_add_i32 s71, s71, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s71, 0xffff +; SI-NEXT: s_lshl_b32 s8, s51, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s28, 0xffff +; SI-NEXT: s_lshl_b32 s9, s70, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s29, 0xffff +; SI-NEXT: s_lshl_b32 s10, s50, 16 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s26, 0xffff +; SI-NEXT: s_lshl_b32 s11, s69, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s27, 0xffff +; SI-NEXT: s_lshl_b32 s12, s49, 16 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s12, s24, 0xffff +; SI-NEXT: s_lshl_b32 s13, s68, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s25, 0xffff +; SI-NEXT: s_lshl_b32 s14, s48, 16 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_and_b32 s14, s22, 0xffff +; SI-NEXT: s_lshl_b32 s15, s67, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: s_and_b32 s15, s23, 0xffff +; SI-NEXT: s_lshl_b32 s22, s39, 16 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_or_b32 s15, s22, s15 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_lshl_b32 s22, s66, 16 +; SI-NEXT: s_or_b32 s20, s22, s20 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s40, s20, 0x30000 +; SI-NEXT: s_and_b32 s20, s21, 0xffff +; SI-NEXT: s_lshl_b32 s21, s38, 16 +; SI-NEXT: s_or_b32 s20, s21, s20 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s41, s20, 0x30000 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s20, s63, 16 +; SI-NEXT: s_or_b32 s18, s20, s18 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s8, s8, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s7 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v21, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s9 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_add_i32 s42, s18, 0x30000 +; SI-NEXT: s_and_b32 s18, s19, 0xffff +; SI-NEXT: s_lshl_b32 s19, s37, 16 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s43, s18, 0x30000 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s18, s61, 16 +; SI-NEXT: s_or_b32 s16, s18, s16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s44, s16, 0x30000 +; SI-NEXT: s_and_b32 s16, s17, 0xffff +; SI-NEXT: s_lshl_b32 s17, s36, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s45, s16, 0x30000 +; SI-NEXT: s_lshr_b64 s[60:61], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[58:59], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[56:57], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[46:47], 16 +; SI-NEXT: s_lshr_b32 s36, s45, 16 +; SI-NEXT: s_lshr_b32 s37, s43, 16 +; SI-NEXT: s_lshr_b32 s38, s41, 16 +; SI-NEXT: s_lshr_b32 s39, s15, 16 +; SI-NEXT: s_lshr_b32 s48, s13, 16 +; SI-NEXT: s_lshr_b32 s49, s11, 16 +; SI-NEXT: s_lshr_b32 s50, s9, 16 +; SI-NEXT: s_lshr_b32 s51, s7, 16 +; SI-NEXT: s_lshr_b32 s52, s5, 16 +; SI-NEXT: s_lshr_b32 s54, s59, 16 +; SI-NEXT: s_lshr_b32 s53, s57, 16 +; SI-NEXT: s_lshr_b32 s55, s47, 16 ; SI-NEXT: .LBB57_3: ; %end -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v19 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v57 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v49 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v4, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v45 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v29 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v26 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v33 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v34 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v37 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v41 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v48 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v44 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v51 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v27 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v53 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v31 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v55 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v35 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v46 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v42 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v43 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: s_and_b32 s16, s44, 0xffff +; SI-NEXT: s_lshl_b32 s17, s60, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s45, 0xffff +; SI-NEXT: s_lshl_b32 s18, s36, 16 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_and_b32 s18, s42, 0xffff +; SI-NEXT: s_lshl_b32 s19, s62, 16 +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: s_and_b32 s19, s43, 0xffff +; SI-NEXT: s_lshl_b32 s20, s37, 16 +; SI-NEXT: s_or_b32 s19, s19, s20 +; SI-NEXT: s_and_b32 s20, s40, 0xffff +; SI-NEXT: s_lshl_b32 s21, s72, 16 +; SI-NEXT: s_or_b32 s20, s20, s21 +; SI-NEXT: s_and_b32 s21, s41, 0xffff +; SI-NEXT: s_lshl_b32 s22, s38, 16 +; SI-NEXT: s_or_b32 s21, s21, s22 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s22, s74, 16 +; SI-NEXT: s_or_b32 s14, s14, s22 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s22, s39, 16 +; SI-NEXT: s_or_b32 s15, s15, s22 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s22, s76, 16 +; SI-NEXT: s_or_b32 s12, s12, s22 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s22, s48, 16 +; SI-NEXT: s_or_b32 s13, s13, s22 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s22, s78, 16 +; SI-NEXT: s_or_b32 s10, s10, s22 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s22, s49, 16 +; SI-NEXT: s_or_b32 s11, s11, s22 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s22, s88, 16 +; SI-NEXT: s_or_b32 s8, s8, s22 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s22, s50, 16 +; SI-NEXT: s_or_b32 s9, s9, s22 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s22, s90, 16 +; SI-NEXT: s_or_b32 s6, s6, s22 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s22, s51, 16 +; SI-NEXT: s_or_b32 s7, s7, s22 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s22, s92, 16 +; SI-NEXT: s_or_b32 s4, s4, s22 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s22, s52, 16 +; SI-NEXT: s_or_b32 s5, s5, s22 +; SI-NEXT: s_and_b32 s22, s58, 0xffff +; SI-NEXT: s_lshl_b32 s23, s94, 16 +; SI-NEXT: s_or_b32 s22, s22, s23 +; SI-NEXT: s_and_b32 s23, s59, 0xffff +; SI-NEXT: s_lshl_b32 s24, s54, 16 +; SI-NEXT: s_or_b32 s23, s23, s24 +; SI-NEXT: s_and_b32 s24, s56, 0xffff +; SI-NEXT: s_lshl_b32 s25, s30, 16 +; SI-NEXT: s_or_b32 s24, s24, s25 +; SI-NEXT: s_and_b32 s25, s57, 0xffff +; SI-NEXT: s_lshl_b32 s26, s53, 16 +; SI-NEXT: s_or_b32 s25, s25, s26 +; SI-NEXT: s_and_b32 s26, s46, 0xffff +; SI-NEXT: s_lshl_b32 s27, s34, 16 +; SI-NEXT: s_or_b32 s26, s26, s27 +; SI-NEXT: s_and_b32 s27, s47, 0xffff +; SI-NEXT: s_lshl_b32 s28, s55, 16 +; SI-NEXT: s_or_b32 s27, s27, s28 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s14 +; SI-NEXT: v_mov_b32_e32 v7, s15 +; SI-NEXT: v_mov_b32_e32 v8, s12 +; SI-NEXT: v_mov_b32_e32 v9, s13 +; SI-NEXT: v_mov_b32_e32 v10, s10 +; SI-NEXT: v_mov_b32_e32 v11, s11 +; SI-NEXT: v_mov_b32_e32 v12, s8 +; SI-NEXT: v_mov_b32_e32 v13, s9 +; SI-NEXT: v_mov_b32_e32 v14, s6 +; SI-NEXT: v_mov_b32_e32 v15, s7 +; SI-NEXT: v_mov_b32_e32 v16, s4 +; SI-NEXT: v_mov_b32_e32 v17, s5 +; SI-NEXT: v_mov_b32_e32 v18, s22 +; SI-NEXT: v_mov_b32_e32 v19, s23 +; SI-NEXT: v_mov_b32_e32 v20, s24 +; SI-NEXT: v_mov_b32_e32 v21, s25 +; SI-NEXT: v_mov_b32_e32 v22, s26 +; SI-NEXT: v_mov_b32_e32 v23, s27 +; SI-NEXT: v_readlane_b32 s99, v24, 35 +; SI-NEXT: v_readlane_b32 s98, v24, 34 +; SI-NEXT: v_readlane_b32 s97, v24, 33 +; SI-NEXT: v_readlane_b32 s96, v24, 32 +; SI-NEXT: v_readlane_b32 s87, v24, 31 +; SI-NEXT: v_readlane_b32 s86, v24, 30 +; SI-NEXT: v_readlane_b32 s85, v24, 29 +; SI-NEXT: v_readlane_b32 s84, v24, 28 +; SI-NEXT: v_readlane_b32 s83, v24, 27 +; SI-NEXT: v_readlane_b32 s82, v24, 26 +; SI-NEXT: v_readlane_b32 s81, v24, 25 +; SI-NEXT: v_readlane_b32 s80, v24, 24 +; SI-NEXT: v_readlane_b32 s71, v24, 23 +; SI-NEXT: v_readlane_b32 s70, v24, 22 +; SI-NEXT: v_readlane_b32 s69, v24, 21 +; SI-NEXT: v_readlane_b32 s68, v24, 20 +; SI-NEXT: v_readlane_b32 s67, v24, 19 +; SI-NEXT: v_readlane_b32 s66, v24, 18 +; SI-NEXT: v_readlane_b32 s65, v24, 17 +; SI-NEXT: v_readlane_b32 s64, v24, 16 +; SI-NEXT: v_readlane_b32 s55, v24, 15 +; SI-NEXT: v_readlane_b32 s54, v24, 14 +; SI-NEXT: v_readlane_b32 s53, v24, 13 +; SI-NEXT: v_readlane_b32 s52, v24, 12 +; SI-NEXT: v_readlane_b32 s51, v24, 11 +; SI-NEXT: v_readlane_b32 s50, v24, 10 +; SI-NEXT: v_readlane_b32 s49, v24, 9 +; SI-NEXT: v_readlane_b32 s48, v24, 8 +; SI-NEXT: v_readlane_b32 s39, v24, 7 +; SI-NEXT: v_readlane_b32 s38, v24, 6 +; SI-NEXT: v_readlane_b32 s37, v24, 5 +; SI-NEXT: v_readlane_b32 s36, v24, 4 +; SI-NEXT: v_readlane_b32 s35, v24, 3 +; SI-NEXT: v_readlane_b32 s34, v24, 2 +; SI-NEXT: v_readlane_b32 s31, v24, 1 +; SI-NEXT: v_readlane_b32 s30, v24, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: v_mov_b32_e32 v15, v12 -; SI-NEXT: v_mov_b32_e32 v14, v57 -; SI-NEXT: v_mov_b32_e32 v12, v47 -; SI-NEXT: v_mov_b32_e32 v10, v45 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v61, v43 -; SI-NEXT: v_mov_b32_e32 v60, v41 -; SI-NEXT: v_mov_b32_e32 v59, v55 -; SI-NEXT: v_mov_b32_e32 v58, v49 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v63, v35 -; SI-NEXT: v_mov_b32_e32 v62, v31 -; SI-NEXT: ; kill: killed $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; kill: killed $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr34 ; SI-NEXT: s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v48i16_to_v48f16_scalar: @@ -38130,425 +35682,318 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v48f16_to_v48i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v35 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v35 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v35 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v41, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v35 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v41 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v40 -; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 -; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v53 -; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v51 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v50 -; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v48 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v39 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_or_b32_e32 v23, v23, v40 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v25, v25, v39 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v21, v21, v40 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v22, v22, v39 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_or_b32_e32 v19, v19, v40 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v28, v28, v39 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v17, v17, v40 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v20, v20, v39 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v15, v15, v40 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_or_b32_e32 v16, v16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_or_b32_e32 v13, v13, v40 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_or_b32_e32 v31, v31, v39 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; SI-NEXT: v_or_b32_e32 v11, v11, v40 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_or_b32_e32 v14, v14, v39 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v55 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 +; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v53 +; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_or_b32_e32 v9, v9, v40 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_or_b32_e32 v10, v10, v39 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_or_b32_e32 v7, v7, v40 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v34, v34, v39 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 -; SI-NEXT: v_or_b32_e32 v8, v8, v39 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 -; SI-NEXT: v_or_b32_e32 v36, v36, v39 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v55 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v53 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v50 -; SI-NEXT: v_or_b32_e32 v2, v2, v39 -; SI-NEXT: v_or_b32_e32 v0, v0, v24 -; SI-NEXT: v_or_b32_e32 v35, v35, v54 -; SI-NEXT: v_or_b32_e32 v4, v4, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v5, v5, v40 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v24 +; SI-NEXT: v_or_b32_e32 v3, v3, v40 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v40 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v0, v0, v55 +; SI-NEXT: v_or_b32_e32 v2, v2, v54 +; SI-NEXT: v_or_b32_e32 v4, v4, v53 ; SI-NEXT: v_or_b32_e32 v6, v6, v52 -; SI-NEXT: v_or_b32_e32 v32, v32, v51 -; SI-NEXT: v_or_b32_e32 v33, v33, v42 -; SI-NEXT: v_or_b32_e32 v12, v12, v49 -; SI-NEXT: v_or_b32_e32 v29, v29, v48 -; SI-NEXT: v_or_b32_e32 v30, v30, v43 -; SI-NEXT: v_or_b32_e32 v18, v18, v38 -; SI-NEXT: v_or_b32_e32 v26, v26, v37 -; SI-NEXT: v_or_b32_e32 v27, v27, v44 -; SI-NEXT: v_alignbit_b32 v40, v2, v24, 16 -; SI-NEXT: v_alignbit_b32 v55, v36, v54, 16 -; SI-NEXT: v_alignbit_b32 v54, v8, v41, 16 -; SI-NEXT: v_alignbit_b32 v53, v34, v52, 16 -; SI-NEXT: v_alignbit_b32 v52, v10, v51, 16 -; SI-NEXT: v_alignbit_b32 v51, v14, v42, 16 -; SI-NEXT: v_alignbit_b32 v50, v31, v49, 16 -; SI-NEXT: v_alignbit_b32 v49, v16, v48, 16 -; SI-NEXT: v_alignbit_b32 v48, v20, v43, 16 -; SI-NEXT: v_alignbit_b32 v39, v28, v38, 16 -; SI-NEXT: v_alignbit_b32 v38, v22, v37, 16 -; SI-NEXT: v_alignbit_b32 v37, v25, v44, 16 +; SI-NEXT: v_or_b32_e32 v8, v8, v51 +; SI-NEXT: v_or_b32_e32 v10, v10, v50 +; SI-NEXT: v_or_b32_e32 v12, v12, v48 +; SI-NEXT: v_or_b32_e32 v14, v14, v39 +; SI-NEXT: v_or_b32_e32 v16, v16, v37 +; SI-NEXT: v_or_b32_e32 v18, v18, v34 +; SI-NEXT: v_or_b32_e32 v20, v20, v31 +; SI-NEXT: v_or_b32_e32 v22, v22, v28 +; SI-NEXT: v_alignbit_b32 v55, v1, v55, 16 +; SI-NEXT: v_alignbit_b32 v54, v3, v54, 16 +; SI-NEXT: v_alignbit_b32 v53, v5, v53, 16 +; SI-NEXT: v_alignbit_b32 v52, v7, v52, 16 +; SI-NEXT: v_alignbit_b32 v51, v9, v51, 16 +; SI-NEXT: v_alignbit_b32 v50, v11, v50, 16 +; SI-NEXT: v_alignbit_b32 v48, v13, v48, 16 +; SI-NEXT: v_alignbit_b32 v39, v15, v39, 16 +; SI-NEXT: v_alignbit_b32 v37, v17, v37, 16 +; SI-NEXT: v_alignbit_b32 v34, v19, v34, 16 +; SI-NEXT: v_alignbit_b32 v31, v21, v31, 16 +; SI-NEXT: v_alignbit_b32 v28, v23, v28, 16 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v24 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v35 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v55 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v2, v2, v24 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v36 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v5, v8, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v53 -; SI-NEXT: v_or_b32_e32 v3, v24, v3 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v3, v3, v24 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v54 -; SI-NEXT: v_or_b32_e32 v6, v6, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v34 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v53 ; SI-NEXT: v_or_b32_e32 v4, v4, v24 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v32 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v24 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v52 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v11, v14, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 -; SI-NEXT: v_or_b32_e32 v8, v8, v24 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v6, v6, v24 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v36 +; SI-NEXT: v_or_b32_e32 v7, v7, v24 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v51 -; SI-NEXT: v_or_b32_e32 v12, v12, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v31 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v8, v8, v24 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v35 +; SI-NEXT: v_or_b32_e32 v9, v9, v24 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v50 ; SI-NEXT: v_or_b32_e32 v10, v10, v24 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v49 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v17, v20, v17 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v39 -; SI-NEXT: v_or_b32_e32 v14, v14, v24 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v30 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v33 +; SI-NEXT: v_or_b32_e32 v11, v11, v24 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v48 -; SI-NEXT: v_or_b32_e32 v18, v18, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v12, v12, v24 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v32 +; SI-NEXT: v_or_b32_e32 v13, v13, v24 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v39 +; SI-NEXT: v_or_b32_e32 v14, v14, v24 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v30 +; SI-NEXT: v_or_b32_e32 v15, v15, v24 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v37 ; SI-NEXT: v_or_b32_e32 v16, v16, v24 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v38 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v29 +; SI-NEXT: v_or_b32_e32 v17, v17, v24 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v34 +; SI-NEXT: v_or_b32_e32 v18, v18, v24 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v27 +; SI-NEXT: v_or_b32_e32 v19, v19, v24 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v31 ; SI-NEXT: v_or_b32_e32 v20, v20, v24 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v27 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v37 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_or_b32_e32 v21, v21, v24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v49 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v54 ; SI-NEXT: v_or_b32_e32 v22, v22, v24 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; SI-NEXT: v_or_b32_e32 v0, v0, v55 +; SI-NEXT: v_or_b32_e32 v2, v2, v49 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -39013,439 +36458,369 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-LABEL: bitcast_v48f16_to_v48i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s14, s21, 16 -; SI-NEXT: s_lshr_b32 s40, s19, 16 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s14 -; SI-NEXT: s_lshr_b32 s10, s25, 16 -; SI-NEXT: s_lshr_b32 s12, s23, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s10 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; SI-NEXT: s_lshr_b32 s6, s29, 16 -; SI-NEXT: s_lshr_b32 s8, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v45, v6 +; SI-NEXT: v_mov_b32_e32 v24, v4 +; SI-NEXT: v_mov_b32_e32 v23, v2 +; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v59, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 -; SI-NEXT: s_lshr_b32 s7, s28, 16 -; SI-NEXT: s_lshr_b32 s9, s26, 16 -; SI-NEXT: s_lshr_b32 s11, s24, 16 -; SI-NEXT: s_lshr_b32 s13, s22, 16 -; SI-NEXT: s_lshr_b32 s15, s20, 16 -; SI-NEXT: s_lshr_b32 s41, s18, 16 -; SI-NEXT: s_lshr_b32 s42, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v7 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v62, v0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v20 +; SI-NEXT: s_lshr_b32 s8, s29, 16 +; SI-NEXT: s_lshr_b32 s6, s28, 16 +; SI-NEXT: s_lshr_b32 s10, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: s_lshr_b32 s12, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: s_lshr_b32 s14, s23, 16 +; SI-NEXT: s_lshr_b32 s11, s22, 16 +; SI-NEXT: s_lshr_b32 s15, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s20, 16 +; SI-NEXT: s_lshr_b32 s41, s19, 16 +; SI-NEXT: s_lshr_b32 s42, s18, 16 +; SI-NEXT: s_lshr_b32 s40, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: s_cbranch_scc0 .LBB59_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: s_cbranch_execnz .LBB59_4 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v50 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 -; SI-NEXT: v_or_b32_e32 v12, v12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v38 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v59 -; SI-NEXT: v_or_b32_e32 v27, v14, v2 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v46 +; SI-NEXT: v_or_b32_e32 v11, v6, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s15 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s21 +; SI-NEXT: v_or_b32_e32 v13, v4, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s13 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v56 +; SI-NEXT: v_or_b32_e32 v20, v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s14 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s11 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v57, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v57 +; SI-NEXT: v_or_b32_e32 v26, v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s12 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v58 +; SI-NEXT: v_or_b32_e32 v28, v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s10 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s7 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v59, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v59 +; SI-NEXT: v_or_b32_e32 v34, v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s8 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v60, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v36, v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v49 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v38 +; SI-NEXT: v_or_b32_e32 v7, v7, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s16 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 +; SI-NEXT: v_or_b32_e32 v1, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v32 +; SI-NEXT: v_or_b32_e32 v9, v9, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v63, v15, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v36 -; SI-NEXT: v_or_b32_e32 v14, v14, v4 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v58 -; SI-NEXT: v_or_b32_e32 v59, v12, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v34 -; SI-NEXT: v_or_b32_e32 v31, v16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v39 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 +; SI-NEXT: v_or_b32_e32 v3, v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s24 +; SI-NEXT: v_or_b32_e32 v31, v16, v10 +; SI-NEXT: v_or_b32_e32 v43, v15, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s22 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v6 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v5 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v58, v14, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v32 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v32, v16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v30 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v44 +; SI-NEXT: v_or_b32_e32 v5, v6, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v37 +; SI-NEXT: v_or_b32_e32 v61, v18, v27 +; SI-NEXT: v_or_b32_e32 v41, v15, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v23 +; SI-NEXT: v_or_b32_e32 v62, v16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v22 +; SI-NEXT: v_or_b32_e32 v42, v17, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v63 -; SI-NEXT: v_or_b32_e32 v62, v16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v29 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_or_b32_e32 v63, v20, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v26 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v26, v22, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v37 -; SI-NEXT: v_or_b32_e32 v28, v22, v20 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v24 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v29, v25, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v60 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v40 -; SI-NEXT: v_or_b32_e32 v23, v23, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v24 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v25 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v60 -; SI-NEXT: v_or_b32_e32 v21, v21, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v41 -; SI-NEXT: v_or_b32_e32 v19, v19, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v61 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v46 -; SI-NEXT: v_or_b32_e32 v17, v17, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v24 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v25 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v61 -; SI-NEXT: v_or_b32_e32 v15, v15, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v57 -; SI-NEXT: v_or_b32_e32 v13, v13, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v56 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v42 -; SI-NEXT: v_or_b32_e32 v11, v11, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v24 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v24 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v56 -; SI-NEXT: v_or_b32_e32 v9, v9, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v45 -; SI-NEXT: v_or_b32_e32 v7, v7, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v25 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v24 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshr_b64 v[38:39], v[8:9], 16 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v47 -; SI-NEXT: v_mov_b32_e32 v39, v32 -; SI-NEXT: v_lshr_b64 v[36:37], v[10:11], 16 -; SI-NEXT: v_lshr_b64 v[34:35], v[12:13], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[14:15], 16 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v44 -; SI-NEXT: v_or_b32_e32 v3, v3, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v43 -; SI-NEXT: v_mov_b32_e32 v37, v29 -; SI-NEXT: v_mov_b32_e32 v35, v28 -; SI-NEXT: v_mov_b32_e32 v33, v31 -; SI-NEXT: v_lshr_b64 v[30:31], v[16:17], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[18:19], 16 -; SI-NEXT: v_or_b32_e32 v5, v5, v25 -; SI-NEXT: v_or_b32_e32 v1, v1, v24 -; SI-NEXT: v_mov_b32_e32 v31, v27 -; SI-NEXT: v_mov_b32_e32 v29, v26 -; SI-NEXT: v_lshr_b64 v[26:27], v[20:21], 16 -; SI-NEXT: v_lshr_b64 v[24:25], v[22:23], 16 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_lshr_b64 v[54:55], v[0:1], 16 -; SI-NEXT: v_lshr_b64 v[52:53], v[2:3], 16 -; SI-NEXT: v_lshr_b64 v[50:51], v[4:5], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[6:7], 16 -; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v54 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshr_b64 v[29:30], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[50:51], v[27:28], 16 +; SI-NEXT: v_or_b32_e32 v23, v15, v2 +; SI-NEXT: v_or_b32_e32 v24, v16, v4 +; SI-NEXT: v_or_b32_e32 v37, v18, v8 +; SI-NEXT: v_mov_b32_e32 v30, v32 +; SI-NEXT: v_mov_b32_e32 v51, v48 +; SI-NEXT: v_lshr_b64 v[48:49], v[33:34], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[35:36], 16 +; SI-NEXT: v_mov_b32_e32 v35, v21 +; SI-NEXT: v_lshr_b64 v[15:16], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[6:7], 16 +; SI-NEXT: v_or_b32_e32 v45, v17, v6 +; SI-NEXT: v_lshr_b64 v[39:40], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[54:55], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[52:53], v[25:26], 16 +; SI-NEXT: v_mov_b32_e32 v49, v38 +; SI-NEXT: v_lshr_b64 v[16:17], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[18:19], v[4:5], 16 +; SI-NEXT: v_mov_b32_e32 v22, v37 +; SI-NEXT: v_lshr_b64 v[37:38], v[8:9], 16 +; SI-NEXT: v_or_b32_e32 v14, v14, v0 +; SI-NEXT: s_branch .LBB59_5 +; SI-NEXT: .LBB59_3: +; SI-NEXT: s_branch .LBB59_2 +; SI-NEXT: .LBB59_4: +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v60, s8 +; SI-NEXT: v_mov_b32_e32 v59, s10 +; SI-NEXT: v_mov_b32_e32 v58, s12 +; SI-NEXT: v_mov_b32_e32 v57, s14 +; SI-NEXT: v_mov_b32_e32 v56, s15 +; SI-NEXT: v_mov_b32_e32 v46, s41 +; SI-NEXT: v_mov_b32_e32 v47, s40 +; SI-NEXT: v_mov_b32_e32 v13, s17 +; SI-NEXT: v_mov_b32_e32 v11, s19 +; SI-NEXT: v_mov_b32_e32 v20, s21 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v62, s28 +; SI-NEXT: v_mov_b32_e32 v41, s26 +; SI-NEXT: v_mov_b32_e32 v61, s24 +; SI-NEXT: v_mov_b32_e32 v42, s22 +; SI-NEXT: v_mov_b32_e32 v43, s20 +; SI-NEXT: v_mov_b32_e32 v21, v29 +; SI-NEXT: v_mov_b32_e32 v31, s18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, s16 +; SI-NEXT: v_mov_b32_e32 v26, s23 +; SI-NEXT: v_mov_b32_e32 v28, s25 +; SI-NEXT: v_mov_b32_e32 v34, s27 +; SI-NEXT: v_mov_b32_e32 v36, s29 +; SI-NEXT: v_mov_b32_e32 v39, s43 +; SI-NEXT: v_mov_b32_e32 v54, s42 +; SI-NEXT: v_mov_b32_e32 v29, s13 +; SI-NEXT: v_mov_b32_e32 v52, s11 +; SI-NEXT: v_mov_b32_e32 v50, s9 +; SI-NEXT: v_mov_b32_e32 v48, s7 +; SI-NEXT: v_mov_b32_e32 v32, s6 +; SI-NEXT: .LBB59_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v63 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v47 +; SI-NEXT: v_or_b32_e32 v27, v2, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v31 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v47 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v50 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v46 +; SI-NEXT: v_or_b32_e32 v25, v4, v6 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v29 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v43 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v44 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v59 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v48 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v56 +; SI-NEXT: v_or_b32_e32 v29, v6, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v52 ; SI-NEXT: v_or_b32_e32 v6, v6, v8 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v45 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v33 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v38 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v57 +; SI-NEXT: v_or_b32_e32 v26, v8, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v50 ; SI-NEXT: v_or_b32_e32 v8, v8, v10 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v58 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v36 -; SI-NEXT: v_or_b32_e32 v10, v10, v12 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v42 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v58 +; SI-NEXT: v_or_b32_e32 v28, v10, v11 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v48 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v59 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v35 ; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v39 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 -; SI-NEXT: v_or_b32_e32 v12, v12, v14 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v57 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v62 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v32 -; SI-NEXT: v_or_b32_e32 v14, v14, v16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v61 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v63 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 -; SI-NEXT: v_or_b32_e32 v16, v16, v18 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v46 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 -; SI-NEXT: v_or_b32_e32 v18, v18, v20 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v41 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v35 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v26 -; SI-NEXT: v_or_b32_e32 v20, v20, v22 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v60 -; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v37 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v22, v22, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 +; SI-NEXT: v_or_b32_e32 v15, v1, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v60 +; SI-NEXT: v_or_b32_e32 v16, v1, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v51 +; SI-NEXT: v_or_b32_e32 v13, v13, v17 +; SI-NEXT: v_or_b32_e32 v17, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; SI-NEXT: v_or_b32_e32 v19, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v45 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -39462,12 +36837,24 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; SI-NEXT: v_or_b32_e32 v20, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; SI-NEXT: v_or_b32_e32 v21, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: v_or_b32_e32 v22, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v30 +; SI-NEXT: v_or_b32_e32 v23, v1, v3 +; SI-NEXT: v_mov_b32_e32 v1, v27 +; SI-NEXT: v_mov_b32_e32 v3, v25 +; SI-NEXT: v_mov_b32_e32 v5, v29 +; SI-NEXT: v_mov_b32_e32 v7, v26 +; SI-NEXT: v_mov_b32_e32 v9, v28 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB59_4: -; SI-NEXT: s_branch .LBB59_2 ; ; VI-LABEL: bitcast_v48f16_to_v48i16_scalar: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll index 18de1fc68024e..911c911fa1ad4 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll @@ -6416,490 +6416,217 @@ end: define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v26i32_to_v52f16: ; SI: ; %bb.0: -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v26, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v27, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v28, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v29, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v30, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v31, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v32, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v33, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v34, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v37, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v39, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v50, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v52, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v56, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 ; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_alignbit_b32 v26, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v27, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v28, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v29, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v30, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v31, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v32, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v33, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v34, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v37, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v39, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v50, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v52, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_mov_b32_e32 v51, v24 -; SI-NEXT: v_mov_b32_e32 v49, v25 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 ; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v32 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v29 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v26 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v63 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v59 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v60 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v47 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v56 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v48 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v49 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v54 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v53 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v40 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v44 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v51 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_or_b32_e32 v0, v0, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v43 +; SI-NEXT: v_or_b32_e32 v2, v2, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v42 +; SI-NEXT: v_or_b32_e32 v4, v4, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v41 +; SI-NEXT: v_or_b32_e32 v6, v6, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v8, v34 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v55 +; SI-NEXT: v_or_b32_e32 v10, v10, v33 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v54 +; SI-NEXT: v_or_b32_e32 v12, v12, v32 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v53 +; SI-NEXT: v_or_b32_e32 v14, v14, v31 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v51 +; SI-NEXT: v_or_b32_e32 v16, v16, v30 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v49 +; SI-NEXT: v_or_b32_e32 v18, v18, v29 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_or_b32_e32 v20, v20, v28 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v38 +; SI-NEXT: v_or_b32_e32 v22, v22, v27 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v36 +; SI-NEXT: v_or_b32_e32 v24, v24, v26 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v52 +; SI-NEXT: v_or_b32_e32 v3, v3, v50 +; SI-NEXT: v_or_b32_e32 v5, v5, v39 +; SI-NEXT: v_or_b32_e32 v7, v7, v37 +; SI-NEXT: v_or_b32_e32 v9, v9, v34 +; SI-NEXT: v_or_b32_e32 v11, v11, v33 +; SI-NEXT: v_or_b32_e32 v13, v13, v32 +; SI-NEXT: v_or_b32_e32 v15, v15, v31 +; SI-NEXT: v_or_b32_e32 v17, v17, v30 +; SI-NEXT: v_or_b32_e32 v19, v19, v29 +; SI-NEXT: v_or_b32_e32 v21, v21, v28 +; SI-NEXT: v_or_b32_e32 v23, v23, v27 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -7448,6 +7175,16 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i ; SI-LABEL: bitcast_v26i32_to_v52f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v26, s30, 0 +; SI-NEXT: v_writelane_b32 v26, s31, 1 +; SI-NEXT: v_writelane_b32 v26, s34, 2 +; SI-NEXT: v_writelane_b32 v26, s35, 3 +; SI-NEXT: v_writelane_b32 v26, s36, 4 +; SI-NEXT: v_writelane_b32 v26, s37, 5 ; SI-NEXT: v_mov_b32_e32 v13, s16 ; SI-NEXT: v_mov_b32_e32 v14, s17 ; SI-NEXT: v_mov_b32_e32 v15, s18 @@ -7455,138 +7192,80 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v17, s20 ; SI-NEXT: v_mov_b32_e32 v18, s21 ; SI-NEXT: v_mov_b32_e32 v19, s22 -; SI-NEXT: v_readfirstlane_b32 s40, v13 +; SI-NEXT: v_writelane_b32 v26, s38, 6 +; SI-NEXT: v_readfirstlane_b32 s42, v13 ; SI-NEXT: v_mov_b32_e32 v13, s23 -; SI-NEXT: v_readfirstlane_b32 s41, v14 +; SI-NEXT: v_readfirstlane_b32 s43, v14 ; SI-NEXT: v_mov_b32_e32 v14, s24 -; SI-NEXT: v_readfirstlane_b32 s24, v15 +; SI-NEXT: v_readfirstlane_b32 s40, v15 ; SI-NEXT: v_mov_b32_e32 v15, s25 -; SI-NEXT: v_readfirstlane_b32 s25, v16 +; SI-NEXT: v_readfirstlane_b32 s41, v16 ; SI-NEXT: v_mov_b32_e32 v16, s26 -; SI-NEXT: v_readfirstlane_b32 s26, v17 +; SI-NEXT: v_readfirstlane_b32 s24, v17 ; SI-NEXT: v_mov_b32_e32 v17, s27 -; SI-NEXT: v_readfirstlane_b32 s27, v18 +; SI-NEXT: v_readfirstlane_b32 s25, v18 ; SI-NEXT: v_mov_b32_e32 v18, s28 -; SI-NEXT: v_readfirstlane_b32 s28, v19 +; SI-NEXT: v_readfirstlane_b32 s22, v19 ; SI-NEXT: v_mov_b32_e32 v19, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; SI-NEXT: v_readfirstlane_b32 s29, v13 -; SI-NEXT: v_readfirstlane_b32 s23, v14 -; SI-NEXT: v_readfirstlane_b32 s22, v15 -; SI-NEXT: v_readfirstlane_b32 s21, v16 -; SI-NEXT: v_readfirstlane_b32 s20, v17 -; SI-NEXT: v_readfirstlane_b32 s19, v18 -; SI-NEXT: v_readfirstlane_b32 s18, v19 -; SI-NEXT: v_readfirstlane_b32 s17, v0 -; SI-NEXT: v_readfirstlane_b32 s16, v1 -; SI-NEXT: v_readfirstlane_b32 s15, v2 -; SI-NEXT: v_readfirstlane_b32 s14, v3 -; SI-NEXT: v_readfirstlane_b32 s13, v4 -; SI-NEXT: v_readfirstlane_b32 s12, v5 -; SI-NEXT: v_readfirstlane_b32 s11, v6 -; SI-NEXT: v_readfirstlane_b32 s10, v7 -; SI-NEXT: v_readfirstlane_b32 s8, v8 +; SI-NEXT: v_writelane_b32 v26, s39, 7 +; SI-NEXT: v_readfirstlane_b32 s23, v13 +; SI-NEXT: v_readfirstlane_b32 s20, v14 +; SI-NEXT: v_readfirstlane_b32 s21, v15 +; SI-NEXT: v_readfirstlane_b32 s18, v16 +; SI-NEXT: v_readfirstlane_b32 s19, v17 +; SI-NEXT: v_readfirstlane_b32 s16, v18 +; SI-NEXT: v_readfirstlane_b32 s17, v19 +; SI-NEXT: v_readfirstlane_b32 s14, v0 +; SI-NEXT: v_readfirstlane_b32 s15, v1 +; SI-NEXT: v_readfirstlane_b32 s12, v2 +; SI-NEXT: v_readfirstlane_b32 s13, v3 +; SI-NEXT: v_readfirstlane_b32 s10, v4 +; SI-NEXT: v_readfirstlane_b32 s11, v5 +; SI-NEXT: v_readfirstlane_b32 s8, v6 +; SI-NEXT: v_readfirstlane_b32 s9, v7 +; SI-NEXT: v_readfirstlane_b32 s6, v8 ; SI-NEXT: v_readfirstlane_b32 s7, v9 -; SI-NEXT: v_readfirstlane_b32 s6, v10 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v11 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s4, v10 +; SI-NEXT: s_and_b64 s[26:27], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v11 +; SI-NEXT: v_writelane_b32 v26, s48, 8 ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 -; SI-NEXT: s_lshr_b32 s4, s41, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s40, 16 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s40 +; SI-NEXT: s_lshr_b32 s92, s5, 16 +; SI-NEXT: s_lshr_b32 s93, s7, 16 +; SI-NEXT: s_lshr_b32 s94, s9, 16 +; SI-NEXT: s_lshr_b32 s95, s11, 16 +; SI-NEXT: s_lshr_b32 s30, s13, 16 +; SI-NEXT: s_lshr_b32 s31, s15, 16 +; SI-NEXT: s_lshr_b32 s34, s17, 16 +; SI-NEXT: s_lshr_b32 s35, s19, 16 +; SI-NEXT: s_lshr_b32 s36, s21, 16 +; SI-NEXT: s_lshr_b32 s37, s23, 16 +; SI-NEXT: s_lshr_b32 s38, s25, 16 +; SI-NEXT: s_lshr_b32 s39, s41, 16 +; SI-NEXT: s_lshr_b32 s48, s43, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[42:43], 16 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: s_add_i32 s42, s42, 3 ; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 @@ -7601,258 +7280,184 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i ; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_lshr_b32 s4, s40, 16 -; SI-NEXT: s_lshr_b32 s5, s41, 16 -; SI-NEXT: s_lshr_b32 s42, s24, 16 -; SI-NEXT: s_lshr_b32 s43, s25, 16 -; SI-NEXT: s_lshr_b32 s44, s26, 16 -; SI-NEXT: s_lshr_b32 s45, s27, 16 -; SI-NEXT: s_lshr_b32 s46, s28, 16 -; SI-NEXT: s_lshr_b32 s47, s29, 16 -; SI-NEXT: s_lshr_b32 s56, s23, 16 -; SI-NEXT: s_lshr_b32 s57, s22, 16 -; SI-NEXT: s_lshr_b32 s58, s21, 16 -; SI-NEXT: s_lshr_b32 s59, s20, 16 -; SI-NEXT: s_lshr_b32 s60, s19, 16 -; SI-NEXT: s_lshr_b32 s61, s18, 16 -; SI-NEXT: s_lshr_b32 s62, s17, 16 -; SI-NEXT: s_lshr_b32 s63, s16, 16 -; SI-NEXT: s_lshr_b32 s72, s15, 16 -; SI-NEXT: s_lshr_b32 s73, s14, 16 -; SI-NEXT: s_lshr_b32 s74, s13, 16 -; SI-NEXT: s_lshr_b32 s75, s12, 16 -; SI-NEXT: s_lshr_b32 s76, s11, 16 -; SI-NEXT: s_lshr_b32 s77, s10, 16 -; SI-NEXT: s_lshr_b32 s78, s8, 16 -; SI-NEXT: s_lshr_b32 s79, s7, 16 -; SI-NEXT: s_lshr_b32 s88, s6, 16 -; SI-NEXT: s_lshr_b32 s89, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s25 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v40, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s41 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v42, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s89 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s88 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s79 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s78 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s77 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s76 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s75 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s74 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s73 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s72 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s63 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s62 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 -; SI-NEXT: .LBB17_3: ; %end -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: v_or_b32_e32 v2, v40, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v4 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v44, v1 -; SI-NEXT: v_or_b32_e32 v3, v42, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v40 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v5, v5, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v54 -; SI-NEXT: v_or_b32_e32 v7, v7, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v52 -; SI-NEXT: v_or_b32_e32 v9, v50, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v50 -; SI-NEXT: v_or_b32_e32 v11, v48, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v48 -; SI-NEXT: v_or_b32_e32 v13, v38, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v38 -; SI-NEXT: v_or_b32_e32 v15, v36, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 -; SI-NEXT: v_or_b32_e32 v17, v34, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v34 -; SI-NEXT: v_or_b32_e32 v19, v32, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v32 -; SI-NEXT: v_or_b32_e32 v21, v30, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v30 -; SI-NEXT: v_or_b32_e32 v23, v28, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v28 -; SI-NEXT: v_or_b32_e32 v4, v55, v4 -; SI-NEXT: v_or_b32_e32 v6, v53, v6 -; SI-NEXT: v_or_b32_e32 v8, v51, v8 -; SI-NEXT: v_or_b32_e32 v10, v49, v10 -; SI-NEXT: v_or_b32_e32 v12, v39, v12 -; SI-NEXT: v_or_b32_e32 v14, v37, v14 -; SI-NEXT: v_or_b32_e32 v16, v35, v16 -; SI-NEXT: v_or_b32_e32 v18, v33, v18 -; SI-NEXT: v_or_b32_e32 v20, v31, v20 -; SI-NEXT: v_or_b32_e32 v22, v29, v22 -; SI-NEXT: v_or_b32_e32 v24, v27, v24 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: s_add_i32 s5, s5, 3 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s92, s5, 16 +; SI-NEXT: s_lshr_b32 s93, s7, 16 +; SI-NEXT: s_lshr_b32 s94, s9, 16 +; SI-NEXT: s_lshr_b32 s95, s11, 16 +; SI-NEXT: s_lshr_b32 s30, s13, 16 +; SI-NEXT: s_lshr_b32 s31, s15, 16 +; SI-NEXT: s_lshr_b32 s34, s17, 16 +; SI-NEXT: s_lshr_b32 s35, s19, 16 +; SI-NEXT: s_lshr_b32 s36, s21, 16 +; SI-NEXT: s_lshr_b32 s37, s23, 16 +; SI-NEXT: s_lshr_b32 s38, s25, 16 +; SI-NEXT: s_lshr_b32 s39, s41, 16 +; SI-NEXT: s_lshr_b32 s48, s43, 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[42:43], 16 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: s_lshl_b32 s27, s88, 16 +; SI-NEXT: s_and_b32 s29, s42, 0xffff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s29, s43, 0xffff +; SI-NEXT: s_lshl_b32 s42, s48, 16 +; SI-NEXT: s_or_b32 s29, s29, s42 +; SI-NEXT: s_lshl_b32 s42, s78, 16 +; SI-NEXT: s_and_b32 s40, s40, 0xffff +; SI-NEXT: s_or_b32 s40, s40, s42 +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: s_lshl_b32 s42, s39, 16 +; SI-NEXT: s_or_b32 s41, s41, s42 +; SI-NEXT: s_lshl_b32 s42, s76, 16 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_or_b32 s24, s24, s42 +; SI-NEXT: s_and_b32 s25, s25, 0xffff +; SI-NEXT: s_lshl_b32 s42, s38, 16 +; SI-NEXT: s_or_b32 s25, s25, s42 +; SI-NEXT: s_lshl_b32 s42, s74, 16 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_or_b32 s22, s22, s42 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s42, s37, 16 +; SI-NEXT: s_or_b32 s23, s23, s42 +; SI-NEXT: s_lshl_b32 s42, s72, 16 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_or_b32 s20, s20, s42 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s42, s36, 16 +; SI-NEXT: s_or_b32 s21, s21, s42 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s42, s62, 16 +; SI-NEXT: s_or_b32 s18, s18, s42 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s42, s35, 16 +; SI-NEXT: s_or_b32 s19, s19, s42 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s42, s60, 16 +; SI-NEXT: s_or_b32 s16, s16, s42 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s42, s34, 16 +; SI-NEXT: s_or_b32 s17, s17, s42 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s42, s58, 16 +; SI-NEXT: s_or_b32 s14, s14, s42 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s42, s31, 16 +; SI-NEXT: s_or_b32 s15, s15, s42 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s42, s56, 16 +; SI-NEXT: s_or_b32 s12, s12, s42 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s42, s30, 16 +; SI-NEXT: s_or_b32 s13, s13, s42 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s42, s46, 16 +; SI-NEXT: s_or_b32 s10, s10, s42 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s42, s95, 16 +; SI-NEXT: s_or_b32 s11, s11, s42 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s42, s44, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s28, s28, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s42 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s42, s94, 16 +; SI-NEXT: s_or_b32 s6, s6, s28 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s28, s93, 16 +; SI-NEXT: s_or_b32 s4, s4, s26 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s26, s92, 16 +; SI-NEXT: s_or_b32 s9, s9, s42 +; SI-NEXT: s_or_b32 s7, s7, s28 +; SI-NEXT: s_or_b32 s5, s5, s26 +; SI-NEXT: v_mov_b32_e32 v0, s27 +; SI-NEXT: v_mov_b32_e32 v1, s29 +; SI-NEXT: v_mov_b32_e32 v2, s40 +; SI-NEXT: v_mov_b32_e32 v3, s41 +; SI-NEXT: v_mov_b32_e32 v4, s24 +; SI-NEXT: v_mov_b32_e32 v5, s25 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v9, s21 +; SI-NEXT: v_mov_b32_e32 v10, s18 +; SI-NEXT: v_mov_b32_e32 v11, s19 +; SI-NEXT: v_mov_b32_e32 v12, s16 +; SI-NEXT: v_mov_b32_e32 v13, s17 +; SI-NEXT: v_mov_b32_e32 v14, s14 +; SI-NEXT: v_mov_b32_e32 v15, s15 +; SI-NEXT: v_mov_b32_e32 v16, s12 +; SI-NEXT: v_mov_b32_e32 v17, s13 +; SI-NEXT: v_mov_b32_e32 v18, s10 +; SI-NEXT: v_mov_b32_e32 v19, s11 +; SI-NEXT: v_mov_b32_e32 v20, s8 +; SI-NEXT: v_mov_b32_e32 v21, s9 +; SI-NEXT: v_mov_b32_e32 v22, s6 +; SI-NEXT: v_mov_b32_e32 v23, s7 +; SI-NEXT: v_mov_b32_e32 v24, s4 +; SI-NEXT: v_mov_b32_e32 v25, s5 +; SI-NEXT: v_readlane_b32 s48, v26, 8 +; SI-NEXT: v_readlane_b32 s39, v26, 7 +; SI-NEXT: v_readlane_b32 s38, v26, 6 +; SI-NEXT: v_readlane_b32 s37, v26, 5 +; SI-NEXT: v_readlane_b32 s36, v26, 4 +; SI-NEXT: v_readlane_b32 s35, v26, 3 +; SI-NEXT: v_readlane_b32 s34, v26, 2 +; SI-NEXT: v_readlane_b32 s31, v26, 1 +; SI-NEXT: v_readlane_b32 s30, v26, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v26i32_to_v52f16_scalar: @@ -8542,7 +8147,6 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v52f16_to_v26i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -8559,186 +8163,243 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v44, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v58, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_mov_b32_e32 v57, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_mov_b32_e32 v38, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_mov_b32_e32 v39, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_mov_b32_e32 v48, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_mov_b32_e32 v49, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_mov_b32_e32 v50, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_mov_b32_e32 v51, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_mov_b32_e32 v52, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_mov_b32_e32 v53, v12 +; SI-NEXT: v_mov_b32_e32 v54, v11 +; SI-NEXT: v_mov_b32_e32 v55, v10 +; SI-NEXT: v_mov_b32_e32 v40, v9 +; SI-NEXT: v_mov_b32_e32 v41, v8 +; SI-NEXT: v_mov_b32_e32 v42, v7 +; SI-NEXT: v_mov_b32_e32 v43, v6 +; SI-NEXT: v_mov_b32_e32 v44, v5 +; SI-NEXT: v_mov_b32_e32 v45, v4 +; SI-NEXT: v_mov_b32_e32 v46, v3 +; SI-NEXT: v_mov_b32_e32 v47, v2 +; SI-NEXT: v_mov_b32_e32 v56, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v25 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v57 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v59 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v58 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v35 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v63 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v33 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v62 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v32 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v55 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v60 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v53 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v51 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v49 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 @@ -8774,134 +8435,20 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v45 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 -; SI-NEXT: v_or_b32_e32 v1, v40, v1 -; SI-NEXT: v_or_b32_e32 v2, v54, v2 -; SI-NEXT: v_or_b32_e32 v3, v52, v3 -; SI-NEXT: v_or_b32_e32 v4, v50, v4 -; SI-NEXT: v_or_b32_e32 v5, v48, v5 -; SI-NEXT: v_or_b32_e32 v6, v38, v6 -; SI-NEXT: v_or_b32_e32 v7, v36, v7 -; SI-NEXT: v_or_b32_e32 v8, v34, v8 -; SI-NEXT: v_or_b32_e32 v9, v32, v9 -; SI-NEXT: v_or_b32_e32 v10, v62, v10 -; SI-NEXT: v_or_b32_e32 v22, v58, v22 -; SI-NEXT: v_or_b32_e32 v23, v56, v23 -; SI-NEXT: v_or_b32_e32 v24, v46, v24 -; SI-NEXT: v_or_b32_e32 v25, v44, v25 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v60, v21 -; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: .LBB18_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -8914,10 +8461,10 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v45 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -8926,118 +8473,121 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v38 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v44 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v43 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v42 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v32 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v63 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v55 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v62 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v51 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v62 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v49 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v48 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v32 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v38 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v61 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v60 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 @@ -9045,98 +8595,98 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v39 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v60 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v57 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v47 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v46 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 @@ -9867,575 +9417,378 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-LABEL: bitcast_v52f16_to_v26i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v11 +; SI-NEXT: v_mov_b32_e32 v33, v10 +; SI-NEXT: v_mov_b32_e32 v34, v9 +; SI-NEXT: v_mov_b32_e32 v35, v8 +; SI-NEXT: v_mov_b32_e32 v36, v7 +; SI-NEXT: v_mov_b32_e32 v37, v6 +; SI-NEXT: v_mov_b32_e32 v38, v5 +; SI-NEXT: v_mov_b32_e32 v39, v4 +; SI-NEXT: v_mov_b32_e32 v48, v3 +; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_mov_b32_e32 v50, v1 +; SI-NEXT: v_mov_b32_e32 v51, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v36 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v37 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v38 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v39 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v48 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v49 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v50 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_lshr_b32 s40, s17, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: s_lshr_b32 s41, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9 -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: s_lshr_b32 s15, s18, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v10 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v51 ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v46 -; SI-NEXT: v_or_b32_e32 v6, v29, v6 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v27 -; SI-NEXT: v_mov_b32_e32 v61, v60 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 -; SI-NEXT: v_or_b32_e32 v2, v37, v2 -; SI-NEXT: v_mov_b32_e32 v57, v58 -; SI-NEXT: v_or_b32_e32 v3, v58, v3 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_or_b32_e32 v4, v47, v4 -; SI-NEXT: v_mov_b32_e32 v47, v45 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_or_b32_e32 v5, v44, v5 -; SI-NEXT: v_or_b32_e32 v7, v34, v7 -; SI-NEXT: v_or_b32_e32 v8, v35, v8 -; SI-NEXT: v_or_b32_e32 v9, v42, v9 -; SI-NEXT: v_or_b32_e32 v10, v41, v10 -; SI-NEXT: v_or_b32_e32 v11, v54, v11 -; SI-NEXT: v_or_b32_e32 v12, v53, v12 -; SI-NEXT: v_or_b32_e32 v13, v51, v13 -; SI-NEXT: v_or_b32_e32 v14, v49, v14 -; SI-NEXT: v_or_b32_e32 v15, v39, v15 -; SI-NEXT: v_or_b32_e32 v16, v28, v16 -; SI-NEXT: v_or_b32_e32 v17, v30, v17 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v29, v25 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s15 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s26 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s6 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v30 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s29 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v51 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v59 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v47 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v46 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v55 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v46 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v45 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v49 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v27 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v31 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v48 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v43 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v42 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v38 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v37 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v40 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v55 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v35 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v34 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v53 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v32 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 ; SI-NEXT: .LBB19_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB19_4: -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v42, v52 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v41, v51 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v40, v50 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v55, v49 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v54, v48 -; SI-NEXT: v_mov_b32_e32 v52, v26 -; SI-NEXT: v_mov_b32_e32 v51, v28 -; SI-NEXT: v_mov_b32_e32 v50, v27 -; SI-NEXT: v_mov_b32_e32 v49, v30 -; SI-NEXT: v_mov_b32_e32 v48, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v61, v60 -; SI-NEXT: v_mov_b32_e32 v31, v48 -; SI-NEXT: v_mov_b32_e32 v30, v49 -; SI-NEXT: v_mov_b32_e32 v27, v50 -; SI-NEXT: v_mov_b32_e32 v28, v51 -; SI-NEXT: v_mov_b32_e32 v26, v52 -; SI-NEXT: v_mov_b32_e32 v48, v54 -; SI-NEXT: v_mov_b32_e32 v49, v55 -; SI-NEXT: v_mov_b32_e32 v50, v40 -; SI-NEXT: v_mov_b32_e32 v51, v41 -; SI-NEXT: v_mov_b32_e32 v52, v42 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v57, v58 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_mov_b32_e32 v47, v45 -; SI-NEXT: v_mov_b32_e32 v45, v44 ; SI-NEXT: s_branch .LBB19_2 ; ; VI-LABEL: bitcast_v52f16_to_v26i32_scalar: @@ -16589,490 +15942,217 @@ end: define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v26f32_to_v52f16: ; SI: ; %bb.0: -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v26, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v27, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v28, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v29, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v30, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v31, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v32, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v33, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v34, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v37, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v39, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v50, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v52, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v56, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 ; SI-NEXT: .LBB32_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 ; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 ; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 ; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 ; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_alignbit_b32 v26, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v27, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v28, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v29, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v30, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v31, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v32, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v33, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v34, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v37, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v39, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v50, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v52, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_mov_b32_e32 v51, v24 -; SI-NEXT: v_mov_b32_e32 v49, v25 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 ; SI-NEXT: .LBB32_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_or_b32_e32 v0, v0, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v43 +; SI-NEXT: v_or_b32_e32 v2, v2, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v42 +; SI-NEXT: v_or_b32_e32 v4, v4, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v41 +; SI-NEXT: v_or_b32_e32 v6, v6, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v40 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v32 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v29 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v26 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v63 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v59 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v60 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v47 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v56 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v48 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v49 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v54 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v53 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v40 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v44 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v51 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v8, v34 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v55 +; SI-NEXT: v_or_b32_e32 v10, v10, v33 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v54 +; SI-NEXT: v_or_b32_e32 v12, v12, v32 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v53 +; SI-NEXT: v_or_b32_e32 v14, v14, v31 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v51 +; SI-NEXT: v_or_b32_e32 v16, v16, v30 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v49 +; SI-NEXT: v_or_b32_e32 v18, v18, v29 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_or_b32_e32 v20, v20, v28 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v38 +; SI-NEXT: v_or_b32_e32 v22, v22, v27 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v36 +; SI-NEXT: v_or_b32_e32 v24, v24, v26 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v52 +; SI-NEXT: v_or_b32_e32 v3, v3, v50 +; SI-NEXT: v_or_b32_e32 v5, v5, v39 +; SI-NEXT: v_or_b32_e32 v7, v7, v37 +; SI-NEXT: v_or_b32_e32 v9, v9, v34 +; SI-NEXT: v_or_b32_e32 v11, v11, v33 +; SI-NEXT: v_or_b32_e32 v13, v13, v32 +; SI-NEXT: v_or_b32_e32 v15, v15, v31 +; SI-NEXT: v_or_b32_e32 v17, v17, v30 +; SI-NEXT: v_or_b32_e32 v19, v19, v29 +; SI-NEXT: v_or_b32_e32 v21, v21, v28 +; SI-NEXT: v_or_b32_e32 v23, v23, v27 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -17596,489 +16676,260 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v28, s16 -; SI-NEXT: v_mov_b32_e32 v27, s17 -; SI-NEXT: v_mov_b32_e32 v26, s18 -; SI-NEXT: v_mov_b32_e32 v22, s19 -; SI-NEXT: v_mov_b32_e32 v24, s20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v63, s21 -; SI-NEXT: v_mov_b32_e32 v62, s22 -; SI-NEXT: v_mov_b32_e32 v19, s23 +; SI-NEXT: v_mov_b32_e32 v24, s16 +; SI-NEXT: v_mov_b32_e32 v25, s17 +; SI-NEXT: v_mov_b32_e32 v20, s18 +; SI-NEXT: v_mov_b32_e32 v21, s19 +; SI-NEXT: v_mov_b32_e32 v16, s20 +; SI-NEXT: v_mov_b32_e32 v17, s21 +; SI-NEXT: v_mov_b32_e32 v22, s22 +; SI-NEXT: v_mov_b32_e32 v23, s23 +; SI-NEXT: v_mov_b32_e32 v18, s24 +; SI-NEXT: v_mov_b32_e32 v19, s25 +; SI-NEXT: v_mov_b32_e32 v14, s26 +; SI-NEXT: v_mov_b32_e32 v15, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v30, s24 -; SI-NEXT: v_mov_b32_e32 v29, s25 -; SI-NEXT: v_mov_b32_e32 v21, s26 -; SI-NEXT: v_mov_b32_e32 v20, s27 -; SI-NEXT: v_mov_b32_e32 v23, s28 -; SI-NEXT: v_mov_b32_e32 v25, s29 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB33_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v12 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v7 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v5 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v23 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v10 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v12 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v0 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v26 +; SI-NEXT: v_lshr_b64 v[48:49], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[50:51], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[51:52], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[52:53], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[14:15], 16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v3 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v1 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v19 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v23 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v28 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v25 +; SI-NEXT: v_lshr_b64 v[38:39], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[53:54], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[24:25], 16 ; SI-NEXT: s_cbranch_execnz .LBB33_3 ; SI-NEXT: .LBB33_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 ; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v25 +; SI-NEXT: v_lshr_b64 v[48:49], v[8:9], 16 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v27 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v26 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v24 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v63 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v62 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 -; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshr_b64 v[49:50], v[6:7], 16 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_lshr_b64 v[50:51], v[4:5], 16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshr_b64 v[51:52], v[2:3], 16 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 ; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 ; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v28 -; SI-NEXT: v_mov_b32_e32 v33, v10 -; SI-NEXT: v_mov_b32_e32 v32, v11 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: .LBB33_3: ; %end -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v18 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v60 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v40 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v58 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v52 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v46 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v39 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v43 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v35 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v55 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v18 -; SI-NEXT: v_or_b32_e32 v11, v19, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v36 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v24, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v32 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_or_b32_e32 v12, v18, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v19 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v13, v20, v13 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_or_b32_e32 v15, v20, v15 -; SI-NEXT: v_or_b32_e32 v14, v18, v14 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v20, v17 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v37 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_lshr_b64 v[52:53], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[53:54], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[24:25], 16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v3 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v1 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v19 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v23 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v25 +; SI-NEXT: .LBB33_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v35 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v36, v24, v27 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v37 +; SI-NEXT: v_or_b32_e32 v37, v24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v26 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v26, v20, v24 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v58 +; SI-NEXT: v_or_b32_e32 v27, v20, v21 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v16, v20 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v57 +; SI-NEXT: v_or_b32_e32 v29, v16, v17 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v16, v17 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v56 +; SI-NEXT: v_or_b32_e32 v31, v16, v17 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v32 +; SI-NEXT: v_or_b32_e32 v32, v16, v17 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v47 +; SI-NEXT: v_or_b32_e32 v33, v16, v17 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v34 +; SI-NEXT: v_or_b32_e32 v34, v14, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v46 +; SI-NEXT: v_or_b32_e32 v35, v14, v15 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v53 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v45 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v52 +; SI-NEXT: v_or_b32_e32 v14, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v38 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v36 +; SI-NEXT: v_mov_b32_e32 v1, v37 +; SI-NEXT: v_mov_b32_e32 v2, v26 +; SI-NEXT: v_mov_b32_e32 v3, v27 +; SI-NEXT: v_mov_b32_e32 v4, v28 +; SI-NEXT: v_mov_b32_e32 v5, v29 +; SI-NEXT: v_mov_b32_e32 v6, v30 +; SI-NEXT: v_mov_b32_e32 v7, v31 +; SI-NEXT: v_mov_b32_e32 v8, v32 +; SI-NEXT: v_mov_b32_e32 v9, v33 +; SI-NEXT: v_mov_b32_e32 v10, v34 +; SI-NEXT: v_mov_b32_e32 v11, v35 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v49 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v33 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; kill: killed $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: s_branch .LBB33_2 ; ; VI-LABEL: bitcast_v26f32_to_v52f16_scalar: @@ -18846,7 +17697,6 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v52f16_to_v26f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -18863,186 +17713,243 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v44, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v58, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_mov_b32_e32 v57, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_mov_b32_e32 v38, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_mov_b32_e32 v39, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_mov_b32_e32 v48, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_mov_b32_e32 v49, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_mov_b32_e32 v50, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_mov_b32_e32 v51, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_mov_b32_e32 v52, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_mov_b32_e32 v53, v12 +; SI-NEXT: v_mov_b32_e32 v54, v11 +; SI-NEXT: v_mov_b32_e32 v55, v10 +; SI-NEXT: v_mov_b32_e32 v40, v9 +; SI-NEXT: v_mov_b32_e32 v41, v8 +; SI-NEXT: v_mov_b32_e32 v42, v7 +; SI-NEXT: v_mov_b32_e32 v43, v6 +; SI-NEXT: v_mov_b32_e32 v44, v5 +; SI-NEXT: v_mov_b32_e32 v45, v4 +; SI-NEXT: v_mov_b32_e32 v46, v3 +; SI-NEXT: v_mov_b32_e32 v47, v2 +; SI-NEXT: v_mov_b32_e32 v56, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v25 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v57 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v59 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v58 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v35 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v63 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v33 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v62 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v32 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v55 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v60 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v53 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v51 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v49 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 @@ -19078,134 +17985,20 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v45 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 -; SI-NEXT: v_or_b32_e32 v1, v40, v1 -; SI-NEXT: v_or_b32_e32 v2, v54, v2 -; SI-NEXT: v_or_b32_e32 v3, v52, v3 -; SI-NEXT: v_or_b32_e32 v4, v50, v4 -; SI-NEXT: v_or_b32_e32 v5, v48, v5 -; SI-NEXT: v_or_b32_e32 v6, v38, v6 -; SI-NEXT: v_or_b32_e32 v7, v36, v7 -; SI-NEXT: v_or_b32_e32 v8, v34, v8 -; SI-NEXT: v_or_b32_e32 v9, v32, v9 -; SI-NEXT: v_or_b32_e32 v10, v62, v10 -; SI-NEXT: v_or_b32_e32 v22, v58, v22 -; SI-NEXT: v_or_b32_e32 v23, v56, v23 -; SI-NEXT: v_or_b32_e32 v24, v46, v24 -; SI-NEXT: v_or_b32_e32 v25, v44, v25 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v60, v21 -; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: .LBB34_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -19218,10 +18011,10 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v45 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -19230,118 +18023,121 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v38 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v44 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v43 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v42 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v32 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v63 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v55 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v62 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v51 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v62 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v49 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v48 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v32 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v38 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v61 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v60 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 @@ -19349,98 +18145,98 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v39 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v60 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v57 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v47 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v46 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 @@ -20171,575 +18967,378 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-LABEL: bitcast_v52f16_to_v26f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_lshr_b32 s40, s17, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: s_lshr_b32 s41, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9 -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: s_lshr_b32 s15, s18, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v10 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_mov_b32_e32 v32, v11 +; SI-NEXT: v_mov_b32_e32 v33, v10 +; SI-NEXT: v_mov_b32_e32 v34, v9 +; SI-NEXT: v_mov_b32_e32 v35, v8 +; SI-NEXT: v_mov_b32_e32 v36, v7 +; SI-NEXT: v_mov_b32_e32 v37, v6 +; SI-NEXT: v_mov_b32_e32 v38, v5 +; SI-NEXT: v_mov_b32_e32 v39, v4 +; SI-NEXT: v_mov_b32_e32 v48, v3 +; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_mov_b32_e32 v50, v1 +; SI-NEXT: v_mov_b32_e32 v51, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v36 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v37 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v38 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v39 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v48 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v49 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v50 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v51 ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v46 -; SI-NEXT: v_or_b32_e32 v6, v29, v6 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v27 -; SI-NEXT: v_mov_b32_e32 v61, v60 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 -; SI-NEXT: v_or_b32_e32 v2, v37, v2 -; SI-NEXT: v_mov_b32_e32 v57, v58 -; SI-NEXT: v_or_b32_e32 v3, v58, v3 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_or_b32_e32 v4, v47, v4 -; SI-NEXT: v_mov_b32_e32 v47, v45 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_or_b32_e32 v5, v44, v5 -; SI-NEXT: v_or_b32_e32 v7, v34, v7 -; SI-NEXT: v_or_b32_e32 v8, v35, v8 -; SI-NEXT: v_or_b32_e32 v9, v42, v9 -; SI-NEXT: v_or_b32_e32 v10, v41, v10 -; SI-NEXT: v_or_b32_e32 v11, v54, v11 -; SI-NEXT: v_or_b32_e32 v12, v53, v12 -; SI-NEXT: v_or_b32_e32 v13, v51, v13 -; SI-NEXT: v_or_b32_e32 v14, v49, v14 -; SI-NEXT: v_or_b32_e32 v15, v39, v15 -; SI-NEXT: v_or_b32_e32 v16, v28, v16 -; SI-NEXT: v_or_b32_e32 v17, v30, v17 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v29, v25 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s15 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v52 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s26 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v30 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v59 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v47 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v46 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v55 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s6 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s29 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v51 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v46 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v45 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v49 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v27 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v31 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v48 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v43 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v42 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v38 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v37 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v40 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v55 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v35 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v34 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v53 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v32 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 ; SI-NEXT: .LBB35_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB35_4: -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v42, v52 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v41, v51 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v40, v50 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v55, v49 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v54, v48 -; SI-NEXT: v_mov_b32_e32 v52, v26 -; SI-NEXT: v_mov_b32_e32 v51, v28 -; SI-NEXT: v_mov_b32_e32 v50, v27 -; SI-NEXT: v_mov_b32_e32 v49, v30 -; SI-NEXT: v_mov_b32_e32 v48, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v61, v60 -; SI-NEXT: v_mov_b32_e32 v31, v48 -; SI-NEXT: v_mov_b32_e32 v30, v49 -; SI-NEXT: v_mov_b32_e32 v27, v50 -; SI-NEXT: v_mov_b32_e32 v28, v51 -; SI-NEXT: v_mov_b32_e32 v26, v52 -; SI-NEXT: v_mov_b32_e32 v48, v54 -; SI-NEXT: v_mov_b32_e32 v49, v55 -; SI-NEXT: v_mov_b32_e32 v50, v40 -; SI-NEXT: v_mov_b32_e32 v51, v41 -; SI-NEXT: v_mov_b32_e32 v52, v42 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v57, v58 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_mov_b32_e32 v47, v45 -; SI-NEXT: v_mov_b32_e32 v45, v44 ; SI-NEXT: s_branch .LBB35_2 ; ; VI-LABEL: bitcast_v52f16_to_v26f32_scalar: @@ -26043,214 +24642,72 @@ end: define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v13i64_to_v52f16: ; SI: ; %bb.0: -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v26, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v27, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v28, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v29, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v30, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v31, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v32, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v33, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v34, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v36, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v39, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v49, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v52, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v56, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 ; SI-NEXT: .LBB44_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_4 @@ -26267,7 +24724,6 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 @@ -26278,263 +24734,133 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 ; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; SI-NEXT: v_cvt_f32_f16_e32 v38, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 ; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v26, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v27, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v28, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v29, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v30, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v31, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v32, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v33, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v34, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v36, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v39, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v49, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v52, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_mov_b32_e32 v51, v24 -; SI-NEXT: v_mov_b32_e32 v49, v25 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 ; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_or_b32_e32 v0, v0, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v43 +; SI-NEXT: v_or_b32_e32 v2, v2, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v42 +; SI-NEXT: v_or_b32_e32 v4, v4, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v41 +; SI-NEXT: v_or_b32_e32 v6, v6, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v40 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v32 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v29 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v26 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v63 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v59 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v60 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v47 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v56 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v48 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v49 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v54 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v53 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v40 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v44 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v51 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v13i64_to_v52f16: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v8, v34 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v55 +; SI-NEXT: v_or_b32_e32 v10, v10, v33 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v54 +; SI-NEXT: v_or_b32_e32 v12, v12, v32 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v53 +; SI-NEXT: v_or_b32_e32 v14, v14, v31 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v51 +; SI-NEXT: v_or_b32_e32 v16, v16, v30 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v50 +; SI-NEXT: v_or_b32_e32 v18, v18, v29 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_or_b32_e32 v20, v20, v28 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v38 +; SI-NEXT: v_or_b32_e32 v22, v22, v27 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 +; SI-NEXT: v_or_b32_e32 v24, v24, v26 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v52 +; SI-NEXT: v_or_b32_e32 v3, v3, v49 +; SI-NEXT: v_or_b32_e32 v5, v5, v39 +; SI-NEXT: v_or_b32_e32 v7, v7, v36 +; SI-NEXT: v_or_b32_e32 v9, v9, v34 +; SI-NEXT: v_or_b32_e32 v11, v11, v33 +; SI-NEXT: v_or_b32_e32 v13, v13, v32 +; SI-NEXT: v_or_b32_e32 v15, v15, v31 +; SI-NEXT: v_or_b32_e32 v17, v17, v30 +; SI-NEXT: v_or_b32_e32 v19, v19, v29 +; SI-NEXT: v_or_b32_e32 v21, v21, v28 +; SI-NEXT: v_or_b32_e32 v23, v23, v27 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v13i64_to_v52f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill @@ -27089,6 +25415,16 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i ; SI-LABEL: bitcast_v13i64_to_v52f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v26, s30, 0 +; SI-NEXT: v_writelane_b32 v26, s31, 1 +; SI-NEXT: v_writelane_b32 v26, s34, 2 +; SI-NEXT: v_writelane_b32 v26, s35, 3 +; SI-NEXT: v_writelane_b32 v26, s36, 4 +; SI-NEXT: v_writelane_b32 v26, s37, 5 ; SI-NEXT: v_mov_b32_e32 v13, s16 ; SI-NEXT: v_mov_b32_e32 v14, s17 ; SI-NEXT: v_mov_b32_e32 v15, s18 @@ -27096,404 +25432,272 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v17, s20 ; SI-NEXT: v_mov_b32_e32 v18, s21 ; SI-NEXT: v_mov_b32_e32 v19, s22 -; SI-NEXT: v_readfirstlane_b32 s40, v13 +; SI-NEXT: v_writelane_b32 v26, s38, 6 +; SI-NEXT: v_readfirstlane_b32 s42, v13 ; SI-NEXT: v_mov_b32_e32 v13, s23 -; SI-NEXT: v_readfirstlane_b32 s41, v14 +; SI-NEXT: v_readfirstlane_b32 s43, v14 ; SI-NEXT: v_mov_b32_e32 v14, s24 -; SI-NEXT: v_readfirstlane_b32 s24, v15 +; SI-NEXT: v_readfirstlane_b32 s40, v15 ; SI-NEXT: v_mov_b32_e32 v15, s25 -; SI-NEXT: v_readfirstlane_b32 s42, v16 +; SI-NEXT: v_readfirstlane_b32 s41, v16 ; SI-NEXT: v_mov_b32_e32 v16, s26 -; SI-NEXT: v_readfirstlane_b32 s25, v17 +; SI-NEXT: v_readfirstlane_b32 s24, v17 ; SI-NEXT: v_mov_b32_e32 v17, s27 -; SI-NEXT: v_readfirstlane_b32 s27, v18 +; SI-NEXT: v_readfirstlane_b32 s25, v18 ; SI-NEXT: v_mov_b32_e32 v18, s28 -; SI-NEXT: v_readfirstlane_b32 s26, v19 +; SI-NEXT: v_readfirstlane_b32 s22, v19 ; SI-NEXT: v_mov_b32_e32 v19, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; SI-NEXT: v_readfirstlane_b32 s28, v13 -; SI-NEXT: v_readfirstlane_b32 s22, v14 -; SI-NEXT: v_readfirstlane_b32 s23, v15 -; SI-NEXT: v_readfirstlane_b32 s20, v16 -; SI-NEXT: v_readfirstlane_b32 s21, v17 -; SI-NEXT: v_readfirstlane_b32 s18, v18 -; SI-NEXT: v_readfirstlane_b32 s19, v19 -; SI-NEXT: v_readfirstlane_b32 s16, v0 -; SI-NEXT: v_readfirstlane_b32 s17, v1 -; SI-NEXT: v_readfirstlane_b32 s14, v2 -; SI-NEXT: v_readfirstlane_b32 s15, v3 -; SI-NEXT: v_readfirstlane_b32 s12, v4 -; SI-NEXT: v_readfirstlane_b32 s13, v5 -; SI-NEXT: v_readfirstlane_b32 s10, v6 -; SI-NEXT: v_readfirstlane_b32 s11, v7 -; SI-NEXT: v_readfirstlane_b32 s7, v8 -; SI-NEXT: v_readfirstlane_b32 s8, v9 -; SI-NEXT: v_readfirstlane_b32 s6, v10 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v11 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_writelane_b32 v26, s39, 7 +; SI-NEXT: v_readfirstlane_b32 s23, v13 +; SI-NEXT: v_readfirstlane_b32 s20, v14 +; SI-NEXT: v_readfirstlane_b32 s21, v15 +; SI-NEXT: v_readfirstlane_b32 s18, v16 +; SI-NEXT: v_readfirstlane_b32 s19, v17 +; SI-NEXT: v_readfirstlane_b32 s16, v18 +; SI-NEXT: v_readfirstlane_b32 s17, v19 +; SI-NEXT: v_readfirstlane_b32 s14, v0 +; SI-NEXT: v_readfirstlane_b32 s15, v1 +; SI-NEXT: v_readfirstlane_b32 s12, v2 +; SI-NEXT: v_readfirstlane_b32 s13, v3 +; SI-NEXT: v_readfirstlane_b32 s10, v4 +; SI-NEXT: v_readfirstlane_b32 s11, v5 +; SI-NEXT: v_readfirstlane_b32 s8, v6 +; SI-NEXT: v_readfirstlane_b32 s9, v7 +; SI-NEXT: v_readfirstlane_b32 s6, v8 +; SI-NEXT: v_readfirstlane_b32 s7, v9 +; SI-NEXT: v_readfirstlane_b32 s4, v10 +; SI-NEXT: s_and_b64 s[26:27], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v11 +; SI-NEXT: v_writelane_b32 v26, s48, 8 ; SI-NEXT: s_cbranch_scc0 .LBB45_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 -; SI-NEXT: s_lshr_b32 s4, s42, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 -; SI-NEXT: s_lshr_b32 s4, s41, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s40, 16 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s40 +; SI-NEXT: s_lshr_b32 s92, s5, 16 +; SI-NEXT: s_lshr_b32 s93, s7, 16 +; SI-NEXT: s_lshr_b32 s94, s9, 16 +; SI-NEXT: s_lshr_b32 s95, s11, 16 +; SI-NEXT: s_lshr_b32 s30, s13, 16 +; SI-NEXT: s_lshr_b32 s31, s15, 16 +; SI-NEXT: s_lshr_b32 s34, s17, 16 +; SI-NEXT: s_lshr_b32 s35, s19, 16 +; SI-NEXT: s_lshr_b32 s36, s21, 16 +; SI-NEXT: s_lshr_b32 s37, s23, 16 +; SI-NEXT: s_lshr_b32 s38, s25, 16 +; SI-NEXT: s_lshr_b32 s39, s41, 16 +; SI-NEXT: s_lshr_b32 s48, s43, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[42:43], 16 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s40, 3 -; SI-NEXT: s_addc_u32 s5, s41, 0 -; SI-NEXT: s_lshr_b32 s29, s4, 16 -; SI-NEXT: s_lshr_b32 s40, s5, 16 -; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s41, s42, 0 -; SI-NEXT: s_lshr_b32 s42, s24, 16 -; SI-NEXT: s_lshr_b32 s43, s41, 16 -; SI-NEXT: s_add_u32 s25, s25, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 -; SI-NEXT: s_lshr_b32 s44, s25, 16 -; SI-NEXT: s_lshr_b32 s45, s27, 16 -; SI-NEXT: s_add_u32 s26, s26, 3 -; SI-NEXT: s_addc_u32 s28, s28, 0 -; SI-NEXT: s_lshr_b32 s46, s26, 16 -; SI-NEXT: s_lshr_b32 s47, s28, 16 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_lshr_b32 s56, s22, 16 -; SI-NEXT: s_lshr_b32 s57, s23, 16 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_lshr_b32 s58, s20, 16 -; SI-NEXT: s_lshr_b32 s59, s21, 16 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_lshr_b32 s60, s18, 16 -; SI-NEXT: s_lshr_b32 s61, s19, 16 -; SI-NEXT: s_add_u32 s16, s16, 3 -; SI-NEXT: s_addc_u32 s17, s17, 0 -; SI-NEXT: s_lshr_b32 s62, s16, 16 -; SI-NEXT: s_lshr_b32 s63, s17, 16 -; SI-NEXT: s_add_u32 s14, s14, 3 -; SI-NEXT: s_addc_u32 s15, s15, 0 -; SI-NEXT: s_lshr_b32 s72, s14, 16 -; SI-NEXT: s_lshr_b32 s73, s15, 16 -; SI-NEXT: s_add_u32 s12, s12, 3 -; SI-NEXT: s_addc_u32 s13, s13, 0 -; SI-NEXT: s_lshr_b32 s74, s12, 16 -; SI-NEXT: s_lshr_b32 s75, s13, 16 -; SI-NEXT: s_add_u32 s10, s10, 3 -; SI-NEXT: s_addc_u32 s11, s11, 0 -; SI-NEXT: s_lshr_b32 s76, s10, 16 -; SI-NEXT: s_lshr_b32 s77, s11, 16 -; SI-NEXT: s_add_u32 s7, s7, 3 -; SI-NEXT: s_addc_u32 s8, s8, 0 -; SI-NEXT: s_lshr_b32 s78, s7, 16 -; SI-NEXT: s_lshr_b32 s79, s8, 16 +; SI-NEXT: s_add_u32 s4, s4, 3 +; SI-NEXT: s_addc_u32 s5, s5, 0 ; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s7, s7, 0 +; SI-NEXT: s_add_u32 s8, s8, 3 ; SI-NEXT: s_addc_u32 s9, s9, 0 -; SI-NEXT: s_lshr_b32 s88, s6, 16 -; SI-NEXT: s_lshr_b32 s89, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v40, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s89 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s88 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s79 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s78 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s77 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s76 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s75 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s74 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s73 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s72 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s63 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s62 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s40 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v43, s29 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s40, s40, 3 +; SI-NEXT: s_addc_u32 s41, s41, 0 +; SI-NEXT: s_add_u32 s42, s42, 3 +; SI-NEXT: s_addc_u32 s43, s43, 0 +; SI-NEXT: s_lshr_b32 s92, s5, 16 +; SI-NEXT: s_lshr_b32 s93, s7, 16 +; SI-NEXT: s_lshr_b32 s94, s9, 16 +; SI-NEXT: s_lshr_b32 s95, s11, 16 +; SI-NEXT: s_lshr_b32 s30, s13, 16 +; SI-NEXT: s_lshr_b32 s31, s15, 16 +; SI-NEXT: s_lshr_b32 s34, s17, 16 +; SI-NEXT: s_lshr_b32 s35, s19, 16 +; SI-NEXT: s_lshr_b32 s36, s21, 16 +; SI-NEXT: s_lshr_b32 s37, s23, 16 +; SI-NEXT: s_lshr_b32 s38, s25, 16 +; SI-NEXT: s_lshr_b32 s39, s41, 16 +; SI-NEXT: s_lshr_b32 s48, s43, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[42:43], 16 ; SI-NEXT: .LBB45_3: ; %end -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: v_or_b32_e32 v2, v40, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v4 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v44, v1 -; SI-NEXT: v_or_b32_e32 v3, v42, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v40 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v5, v5, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v54 -; SI-NEXT: v_or_b32_e32 v7, v7, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v52 -; SI-NEXT: v_or_b32_e32 v9, v50, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v50 -; SI-NEXT: v_or_b32_e32 v11, v48, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v48 -; SI-NEXT: v_or_b32_e32 v13, v38, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v38 -; SI-NEXT: v_or_b32_e32 v15, v36, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 -; SI-NEXT: v_or_b32_e32 v17, v34, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v34 -; SI-NEXT: v_or_b32_e32 v19, v32, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v32 -; SI-NEXT: v_or_b32_e32 v21, v30, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v30 -; SI-NEXT: v_or_b32_e32 v23, v28, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v28 -; SI-NEXT: v_or_b32_e32 v4, v55, v4 -; SI-NEXT: v_or_b32_e32 v6, v53, v6 -; SI-NEXT: v_or_b32_e32 v8, v51, v8 -; SI-NEXT: v_or_b32_e32 v10, v49, v10 -; SI-NEXT: v_or_b32_e32 v12, v39, v12 -; SI-NEXT: v_or_b32_e32 v14, v37, v14 -; SI-NEXT: v_or_b32_e32 v16, v35, v16 -; SI-NEXT: v_or_b32_e32 v18, v33, v18 -; SI-NEXT: v_or_b32_e32 v20, v31, v20 -; SI-NEXT: v_or_b32_e32 v22, v29, v22 -; SI-NEXT: v_or_b32_e32 v24, v27, v24 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: s_lshl_b32 s27, s88, 16 +; SI-NEXT: s_and_b32 s29, s42, 0xffff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s29, s43, 0xffff +; SI-NEXT: s_lshl_b32 s42, s48, 16 +; SI-NEXT: s_or_b32 s29, s29, s42 +; SI-NEXT: s_lshl_b32 s42, s78, 16 +; SI-NEXT: s_and_b32 s40, s40, 0xffff +; SI-NEXT: s_or_b32 s40, s40, s42 +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: s_lshl_b32 s42, s39, 16 +; SI-NEXT: s_or_b32 s41, s41, s42 +; SI-NEXT: s_lshl_b32 s42, s76, 16 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_or_b32 s24, s24, s42 +; SI-NEXT: s_and_b32 s25, s25, 0xffff +; SI-NEXT: s_lshl_b32 s42, s38, 16 +; SI-NEXT: s_or_b32 s25, s25, s42 +; SI-NEXT: s_lshl_b32 s42, s74, 16 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_or_b32 s22, s22, s42 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s42, s37, 16 +; SI-NEXT: s_or_b32 s23, s23, s42 +; SI-NEXT: s_lshl_b32 s42, s72, 16 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_or_b32 s20, s20, s42 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s42, s36, 16 +; SI-NEXT: s_or_b32 s21, s21, s42 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s42, s62, 16 +; SI-NEXT: s_or_b32 s18, s18, s42 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s42, s35, 16 +; SI-NEXT: s_or_b32 s19, s19, s42 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s42, s60, 16 +; SI-NEXT: s_or_b32 s16, s16, s42 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s42, s34, 16 +; SI-NEXT: s_or_b32 s17, s17, s42 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s42, s58, 16 +; SI-NEXT: s_or_b32 s14, s14, s42 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s42, s31, 16 +; SI-NEXT: s_or_b32 s15, s15, s42 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s42, s56, 16 +; SI-NEXT: s_or_b32 s12, s12, s42 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s42, s30, 16 +; SI-NEXT: s_or_b32 s13, s13, s42 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s42, s46, 16 +; SI-NEXT: s_or_b32 s10, s10, s42 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s42, s95, 16 +; SI-NEXT: s_or_b32 s11, s11, s42 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s42, s44, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s28, s28, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s42 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s42, s94, 16 +; SI-NEXT: s_or_b32 s6, s6, s28 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s28, s93, 16 +; SI-NEXT: s_or_b32 s4, s4, s26 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s26, s92, 16 +; SI-NEXT: s_or_b32 s9, s9, s42 +; SI-NEXT: s_or_b32 s7, s7, s28 +; SI-NEXT: s_or_b32 s5, s5, s26 +; SI-NEXT: v_mov_b32_e32 v0, s27 +; SI-NEXT: v_mov_b32_e32 v1, s29 +; SI-NEXT: v_mov_b32_e32 v2, s40 +; SI-NEXT: v_mov_b32_e32 v3, s41 +; SI-NEXT: v_mov_b32_e32 v4, s24 +; SI-NEXT: v_mov_b32_e32 v5, s25 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v9, s21 +; SI-NEXT: v_mov_b32_e32 v10, s18 +; SI-NEXT: v_mov_b32_e32 v11, s19 +; SI-NEXT: v_mov_b32_e32 v12, s16 +; SI-NEXT: v_mov_b32_e32 v13, s17 +; SI-NEXT: v_mov_b32_e32 v14, s14 +; SI-NEXT: v_mov_b32_e32 v15, s15 +; SI-NEXT: v_mov_b32_e32 v16, s12 +; SI-NEXT: v_mov_b32_e32 v17, s13 +; SI-NEXT: v_mov_b32_e32 v18, s10 +; SI-NEXT: v_mov_b32_e32 v19, s11 +; SI-NEXT: v_mov_b32_e32 v20, s8 +; SI-NEXT: v_mov_b32_e32 v21, s9 +; SI-NEXT: v_mov_b32_e32 v22, s6 +; SI-NEXT: v_mov_b32_e32 v23, s7 +; SI-NEXT: v_mov_b32_e32 v24, s4 +; SI-NEXT: v_mov_b32_e32 v25, s5 +; SI-NEXT: v_readlane_b32 s48, v26, 8 +; SI-NEXT: v_readlane_b32 s39, v26, 7 +; SI-NEXT: v_readlane_b32 s38, v26, 6 +; SI-NEXT: v_readlane_b32 s37, v26, 5 +; SI-NEXT: v_readlane_b32 s36, v26, 4 +; SI-NEXT: v_readlane_b32 s35, v26, 3 +; SI-NEXT: v_readlane_b32 s34, v26, 2 +; SI-NEXT: v_readlane_b32 s31, v26, 1 +; SI-NEXT: v_readlane_b32 s30, v26, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: s_branch .LBB45_2 ; ; VI-LABEL: bitcast_v13i64_to_v52f16_scalar: @@ -28183,7 +26387,6 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v52f16_to_v13i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -28200,186 +26403,243 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v44, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v58, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v57, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_mov_b32_e32 v38, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_mov_b32_e32 v39, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_mov_b32_e32 v48, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_mov_b32_e32 v49, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_mov_b32_e32 v50, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_mov_b32_e32 v51, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_mov_b32_e32 v52, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_mov_b32_e32 v53, v12 +; SI-NEXT: v_mov_b32_e32 v54, v11 +; SI-NEXT: v_mov_b32_e32 v55, v10 +; SI-NEXT: v_mov_b32_e32 v40, v9 +; SI-NEXT: v_mov_b32_e32 v41, v8 +; SI-NEXT: v_mov_b32_e32 v42, v7 +; SI-NEXT: v_mov_b32_e32 v43, v6 +; SI-NEXT: v_mov_b32_e32 v44, v5 +; SI-NEXT: v_mov_b32_e32 v45, v4 +; SI-NEXT: v_mov_b32_e32 v46, v3 +; SI-NEXT: v_mov_b32_e32 v47, v2 +; SI-NEXT: v_mov_b32_e32 v56, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v25 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v57 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v59 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v58 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v35 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v63 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v33 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v62 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v32 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v55 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v60 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v53 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v51 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v49 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 @@ -28415,134 +26675,20 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v45 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 -; SI-NEXT: v_or_b32_e32 v1, v40, v1 -; SI-NEXT: v_or_b32_e32 v2, v54, v2 -; SI-NEXT: v_or_b32_e32 v3, v52, v3 -; SI-NEXT: v_or_b32_e32 v4, v50, v4 -; SI-NEXT: v_or_b32_e32 v5, v48, v5 -; SI-NEXT: v_or_b32_e32 v6, v38, v6 -; SI-NEXT: v_or_b32_e32 v7, v36, v7 -; SI-NEXT: v_or_b32_e32 v8, v34, v8 -; SI-NEXT: v_or_b32_e32 v9, v32, v9 -; SI-NEXT: v_or_b32_e32 v10, v62, v10 -; SI-NEXT: v_or_b32_e32 v22, v58, v22 -; SI-NEXT: v_or_b32_e32 v23, v56, v23 -; SI-NEXT: v_or_b32_e32 v24, v46, v24 -; SI-NEXT: v_or_b32_e32 v25, v44, v25 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v60, v21 -; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: .LBB46_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -28555,10 +26701,10 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v45 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -28567,118 +26713,121 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v38 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v44 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v43 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v42 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v32 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v63 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v55 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v62 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v51 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v62 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v49 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v48 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v32 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v38 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v61 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v60 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 @@ -28686,98 +26835,98 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v39 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v60 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v57 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v47 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v46 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 @@ -29508,575 +27657,378 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-LABEL: bitcast_v52f16_to_v13i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_lshr_b32 s40, s17, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: s_lshr_b32 s41, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9 -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: s_lshr_b32 s15, s18, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_mov_b32_e32 v32, v11 +; SI-NEXT: v_mov_b32_e32 v33, v10 +; SI-NEXT: v_mov_b32_e32 v34, v9 +; SI-NEXT: v_mov_b32_e32 v35, v8 +; SI-NEXT: v_mov_b32_e32 v36, v7 +; SI-NEXT: v_mov_b32_e32 v37, v6 +; SI-NEXT: v_mov_b32_e32 v38, v5 +; SI-NEXT: v_mov_b32_e32 v39, v4 +; SI-NEXT: v_mov_b32_e32 v48, v3 +; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_mov_b32_e32 v50, v1 +; SI-NEXT: v_mov_b32_e32 v51, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v36 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v37 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v38 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v39 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v10 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v48 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v49 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v50 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v51 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v46 -; SI-NEXT: v_or_b32_e32 v6, v29, v6 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v27 -; SI-NEXT: v_mov_b32_e32 v61, v60 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 -; SI-NEXT: v_or_b32_e32 v2, v37, v2 -; SI-NEXT: v_mov_b32_e32 v57, v58 -; SI-NEXT: v_or_b32_e32 v3, v58, v3 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_or_b32_e32 v4, v47, v4 -; SI-NEXT: v_mov_b32_e32 v47, v45 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_or_b32_e32 v5, v44, v5 -; SI-NEXT: v_or_b32_e32 v7, v34, v7 -; SI-NEXT: v_or_b32_e32 v8, v35, v8 -; SI-NEXT: v_or_b32_e32 v9, v42, v9 -; SI-NEXT: v_or_b32_e32 v10, v41, v10 -; SI-NEXT: v_or_b32_e32 v11, v54, v11 -; SI-NEXT: v_or_b32_e32 v12, v53, v12 -; SI-NEXT: v_or_b32_e32 v13, v51, v13 -; SI-NEXT: v_or_b32_e32 v14, v49, v14 -; SI-NEXT: v_or_b32_e32 v15, v39, v15 -; SI-NEXT: v_or_b32_e32 v16, v28, v16 -; SI-NEXT: v_or_b32_e32 v17, v30, v17 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v29, v25 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s15 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v52 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s26 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s6 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v30 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s29 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v51 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v46 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v45 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v49 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v48 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v43 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v42 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v59 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v47 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v46 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v55 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v27 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v31 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v38 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v37 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v40 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v55 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v35 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v34 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v53 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v32 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 ; SI-NEXT: .LBB47_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v42, v52 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v41, v51 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v40, v50 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v55, v49 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v54, v48 -; SI-NEXT: v_mov_b32_e32 v52, v26 -; SI-NEXT: v_mov_b32_e32 v51, v28 -; SI-NEXT: v_mov_b32_e32 v50, v27 -; SI-NEXT: v_mov_b32_e32 v49, v30 -; SI-NEXT: v_mov_b32_e32 v48, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v61, v60 -; SI-NEXT: v_mov_b32_e32 v31, v48 -; SI-NEXT: v_mov_b32_e32 v30, v49 -; SI-NEXT: v_mov_b32_e32 v27, v50 -; SI-NEXT: v_mov_b32_e32 v28, v51 -; SI-NEXT: v_mov_b32_e32 v26, v52 -; SI-NEXT: v_mov_b32_e32 v48, v54 -; SI-NEXT: v_mov_b32_e32 v49, v55 -; SI-NEXT: v_mov_b32_e32 v50, v40 -; SI-NEXT: v_mov_b32_e32 v51, v41 -; SI-NEXT: v_mov_b32_e32 v52, v42 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v57, v58 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_mov_b32_e32 v47, v45 -; SI-NEXT: v_mov_b32_e32 v45, v44 ; SI-NEXT: s_branch .LBB47_2 ; ; VI-LABEL: bitcast_v52f16_to_v13i64_scalar: @@ -34510,465 +32462,204 @@ end: define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v13f64_to_v52f16: ; SI: ; %bb.0: -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v26, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v27, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v28, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v29, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v30, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v31, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v32, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v33, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v34, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v36, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v39, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v49, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v52, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v56, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 ; SI-NEXT: .LBB52_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 ; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 ; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_alignbit_b32 v26, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v27, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v28, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v29, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v30, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v31, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v32, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v33, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v34, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v36, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v39, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v49, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v52, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_mov_b32_e32 v51, v24 -; SI-NEXT: v_mov_b32_e32 v49, v25 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 ; SI-NEXT: .LBB52_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_or_b32_e32 v0, v0, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v43 +; SI-NEXT: v_or_b32_e32 v2, v2, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v42 +; SI-NEXT: v_or_b32_e32 v4, v4, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v41 +; SI-NEXT: v_or_b32_e32 v6, v6, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v40 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v32 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v29 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v26 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v63 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v59 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v60 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v47 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v56 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v48 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v49 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v54 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v53 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v40 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v44 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v51 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v8, v34 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v55 +; SI-NEXT: v_or_b32_e32 v10, v10, v33 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v54 +; SI-NEXT: v_or_b32_e32 v12, v12, v32 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v53 +; SI-NEXT: v_or_b32_e32 v14, v14, v31 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v51 +; SI-NEXT: v_or_b32_e32 v16, v16, v30 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v50 +; SI-NEXT: v_or_b32_e32 v18, v18, v29 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_or_b32_e32 v20, v20, v28 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v38 +; SI-NEXT: v_or_b32_e32 v22, v22, v27 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 +; SI-NEXT: v_or_b32_e32 v24, v24, v26 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v52 +; SI-NEXT: v_or_b32_e32 v3, v3, v49 +; SI-NEXT: v_or_b32_e32 v5, v5, v39 +; SI-NEXT: v_or_b32_e32 v7, v7, v36 +; SI-NEXT: v_or_b32_e32 v9, v9, v34 +; SI-NEXT: v_or_b32_e32 v11, v11, v33 +; SI-NEXT: v_or_b32_e32 v13, v13, v32 +; SI-NEXT: v_or_b32_e32 v15, v15, v31 +; SI-NEXT: v_or_b32_e32 v17, v17, v30 +; SI-NEXT: v_or_b32_e32 v19, v19, v29 +; SI-NEXT: v_or_b32_e32 v21, v21, v28 +; SI-NEXT: v_or_b32_e32 v23, v23, v27 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -35481,462 +33172,232 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v9 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; SI-NEXT: v_lshr_b64 v[48:49], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[50:51], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[51:52], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[52:53], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[14:15], 16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v3 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v1 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v26 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v7 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v7 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v26 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v5 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v11 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v4 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v10 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v3 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v26 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v1 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v0 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v22 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v61, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v14 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v19 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v23 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v24 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v25 +; SI-NEXT: v_lshr_b64 v[38:39], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[53:54], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[24:25], 16 ; SI-NEXT: s_cbranch_execnz .LBB53_3 ; SI-NEXT: .LBB53_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 -; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_lshr_b64 v[48:49], v[8:9], 16 ; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_lshr_b64 v[49:50], v[6:7], 16 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_lshr_b64 v[50:51], v[4:5], 16 ; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v22 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_lshr_b64 v[51:52], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[52:53], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[53:54], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[24:25], 16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v3 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v1 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v19 +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_mov_b32_e32 v51, v10 -; SI-NEXT: v_mov_b32_e32 v49, v11 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v25 ; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v35 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v36, v24, v27 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v37 +; SI-NEXT: v_or_b32_e32 v37, v24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v26 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v26, v20, v24 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v58 +; SI-NEXT: v_or_b32_e32 v27, v20, v21 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v16, v20 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v57 +; SI-NEXT: v_or_b32_e32 v29, v16, v17 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v16, v17 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v56 +; SI-NEXT: v_or_b32_e32 v31, v16, v17 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v32 +; SI-NEXT: v_or_b32_e32 v32, v16, v17 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v47 +; SI-NEXT: v_or_b32_e32 v33, v16, v17 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v34 +; SI-NEXT: v_or_b32_e32 v34, v14, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v46 +; SI-NEXT: v_or_b32_e32 v35, v14, v15 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v53 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v45 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v52 +; SI-NEXT: v_or_b32_e32 v14, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v38 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v36 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v62 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v34 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v58 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v30 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v45 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v27 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v41 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v61 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v48 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v49 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v53 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v42 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v46 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v57 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v51 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v38 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v36 +; SI-NEXT: v_mov_b32_e32 v1, v37 +; SI-NEXT: v_mov_b32_e32 v2, v26 +; SI-NEXT: v_mov_b32_e32 v3, v27 +; SI-NEXT: v_mov_b32_e32 v4, v28 +; SI-NEXT: v_mov_b32_e32 v5, v29 +; SI-NEXT: v_mov_b32_e32 v6, v30 +; SI-NEXT: v_mov_b32_e32 v7, v31 +; SI-NEXT: v_mov_b32_e32 v8, v32 +; SI-NEXT: v_mov_b32_e32 v9, v33 +; SI-NEXT: v_mov_b32_e32 v10, v34 +; SI-NEXT: v_mov_b32_e32 v11, v35 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: s_branch .LBB53_2 ; ; VI-LABEL: bitcast_v13f64_to_v52f16_scalar: @@ -36678,7 +34139,6 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v52f16_to_v13f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -36695,186 +34155,243 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v44, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v58, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_mov_b32_e32 v57, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_mov_b32_e32 v38, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_mov_b32_e32 v39, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_mov_b32_e32 v48, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_mov_b32_e32 v49, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_mov_b32_e32 v50, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_mov_b32_e32 v51, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_mov_b32_e32 v52, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_mov_b32_e32 v53, v12 +; SI-NEXT: v_mov_b32_e32 v54, v11 +; SI-NEXT: v_mov_b32_e32 v55, v10 +; SI-NEXT: v_mov_b32_e32 v40, v9 +; SI-NEXT: v_mov_b32_e32 v41, v8 +; SI-NEXT: v_mov_b32_e32 v42, v7 +; SI-NEXT: v_mov_b32_e32 v43, v6 +; SI-NEXT: v_mov_b32_e32 v44, v5 +; SI-NEXT: v_mov_b32_e32 v45, v4 +; SI-NEXT: v_mov_b32_e32 v46, v3 +; SI-NEXT: v_mov_b32_e32 v47, v2 +; SI-NEXT: v_mov_b32_e32 v56, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v25 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v57 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v59 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v58 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v35 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v63 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v33 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v62 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v32 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v55 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v60 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v53 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v51 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v49 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 @@ -36910,134 +34427,20 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v45 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 -; SI-NEXT: v_or_b32_e32 v1, v40, v1 -; SI-NEXT: v_or_b32_e32 v2, v54, v2 -; SI-NEXT: v_or_b32_e32 v3, v52, v3 -; SI-NEXT: v_or_b32_e32 v4, v50, v4 -; SI-NEXT: v_or_b32_e32 v5, v48, v5 -; SI-NEXT: v_or_b32_e32 v6, v38, v6 -; SI-NEXT: v_or_b32_e32 v7, v36, v7 -; SI-NEXT: v_or_b32_e32 v8, v34, v8 -; SI-NEXT: v_or_b32_e32 v9, v32, v9 -; SI-NEXT: v_or_b32_e32 v10, v62, v10 -; SI-NEXT: v_or_b32_e32 v22, v58, v22 -; SI-NEXT: v_or_b32_e32 v23, v56, v23 -; SI-NEXT: v_or_b32_e32 v24, v46, v24 -; SI-NEXT: v_or_b32_e32 v25, v44, v25 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v60, v21 -; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: .LBB54_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -37050,10 +34453,10 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v45 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -37062,118 +34465,121 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v38 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v44 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v43 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v42 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v32 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v63 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v55 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v62 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v51 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v62 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v49 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v48 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v32 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v38 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v61 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v60 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 @@ -37181,98 +34587,98 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v39 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v60 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v57 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v47 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v46 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 @@ -38003,575 +35409,378 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-LABEL: bitcast_v52f16_to_v13f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_lshr_b32 s40, s17, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: s_lshr_b32 s41, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9 -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: s_lshr_b32 s15, s18, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v10 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_mov_b32_e32 v32, v11 +; SI-NEXT: v_mov_b32_e32 v33, v10 +; SI-NEXT: v_mov_b32_e32 v34, v9 +; SI-NEXT: v_mov_b32_e32 v35, v8 +; SI-NEXT: v_mov_b32_e32 v36, v7 +; SI-NEXT: v_mov_b32_e32 v37, v6 +; SI-NEXT: v_mov_b32_e32 v38, v5 +; SI-NEXT: v_mov_b32_e32 v39, v4 +; SI-NEXT: v_mov_b32_e32 v48, v3 +; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_mov_b32_e32 v50, v1 +; SI-NEXT: v_mov_b32_e32 v51, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v36 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v37 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v38 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v39 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v48 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v49 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v50 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v51 ; SI-NEXT: s_cbranch_scc0 .LBB55_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v46 -; SI-NEXT: v_or_b32_e32 v6, v29, v6 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v27 -; SI-NEXT: v_mov_b32_e32 v61, v60 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 -; SI-NEXT: v_or_b32_e32 v2, v37, v2 -; SI-NEXT: v_mov_b32_e32 v57, v58 -; SI-NEXT: v_or_b32_e32 v3, v58, v3 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_or_b32_e32 v4, v47, v4 -; SI-NEXT: v_mov_b32_e32 v47, v45 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_or_b32_e32 v5, v44, v5 -; SI-NEXT: v_or_b32_e32 v7, v34, v7 -; SI-NEXT: v_or_b32_e32 v8, v35, v8 -; SI-NEXT: v_or_b32_e32 v9, v42, v9 -; SI-NEXT: v_or_b32_e32 v10, v41, v10 -; SI-NEXT: v_or_b32_e32 v11, v54, v11 -; SI-NEXT: v_or_b32_e32 v12, v53, v12 -; SI-NEXT: v_or_b32_e32 v13, v51, v13 -; SI-NEXT: v_or_b32_e32 v14, v49, v14 -; SI-NEXT: v_or_b32_e32 v15, v39, v15 -; SI-NEXT: v_or_b32_e32 v16, v28, v16 -; SI-NEXT: v_or_b32_e32 v17, v30, v17 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v29, v25 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB55_3 ; SI-NEXT: .LBB55_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v52 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v30 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s15 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v59 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v46 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s26 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v55 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s6 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s29 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v51 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v46 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v45 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v49 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v27 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v31 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v48 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v43 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v42 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v38 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v37 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v40 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v55 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v35 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v34 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v53 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v32 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 ; SI-NEXT: .LBB55_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB55_4: -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v42, v52 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v41, v51 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v40, v50 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v55, v49 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v54, v48 -; SI-NEXT: v_mov_b32_e32 v52, v26 -; SI-NEXT: v_mov_b32_e32 v51, v28 -; SI-NEXT: v_mov_b32_e32 v50, v27 -; SI-NEXT: v_mov_b32_e32 v49, v30 -; SI-NEXT: v_mov_b32_e32 v48, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v61, v60 -; SI-NEXT: v_mov_b32_e32 v31, v48 -; SI-NEXT: v_mov_b32_e32 v30, v49 -; SI-NEXT: v_mov_b32_e32 v27, v50 -; SI-NEXT: v_mov_b32_e32 v28, v51 -; SI-NEXT: v_mov_b32_e32 v26, v52 -; SI-NEXT: v_mov_b32_e32 v48, v54 -; SI-NEXT: v_mov_b32_e32 v49, v55 -; SI-NEXT: v_mov_b32_e32 v50, v40 -; SI-NEXT: v_mov_b32_e32 v51, v41 -; SI-NEXT: v_mov_b32_e32 v52, v42 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v57, v58 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_mov_b32_e32 v47, v45 -; SI-NEXT: v_mov_b32_e32 v45, v44 ; SI-NEXT: s_branch .LBB55_2 ; ; VI-LABEL: bitcast_v52f16_to_v13f64_scalar: @@ -39213,7 +36422,58 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v52i16_to_v52f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v9 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v11 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v33 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v32 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 @@ -39257,488 +36517,561 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v55 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v57 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v60 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v63 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v54 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v40 +; SI-NEXT: v_or_b32_e32 v59, v1, v26 +; SI-NEXT: v_mov_b32_e32 v26, v32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v59, v32, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v41 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 +; SI-NEXT: v_or_b32_e32 v57, v1, v3 +; SI-NEXT: v_alignbit_b32 v1, v57, v54, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_or_b32_e32 v47, v1, v3 +; SI-NEXT: v_alignbit_b32 v1, v47, v41, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v39 +; SI-NEXT: v_or_b32_e32 v45, v1, v3 +; SI-NEXT: v_alignbit_b32 v1, v45, v46, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: v_or_b32_e32 v43, v1, v3 +; SI-NEXT: v_alignbit_b32 v1, v43, v58, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_or_b32_e32 v42, v1, v3 +; SI-NEXT: v_alignbit_b32 v1, v42, v63, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; SI-NEXT: v_or_b32_e32 v55, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v0, v32 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v56 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: .LBB56_2: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB56_4 -; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 -; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 -; SI-NEXT: v_add_i32_e32 v51, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 -; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v40 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 -; SI-NEXT: v_add_i32_e32 v41, vcc, 3, v41 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v42 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 -; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v43 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v44 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v54 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v56 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_alignbit_b32 v1, v55, v60, 16 +; SI-NEXT: v_or_b32_e32 v0, v0, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v63 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v41 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v56 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v63 -; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v62 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v61 -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v60 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v59 -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v58 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v57 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v44 +; SI-NEXT: v_mov_b32_e32 v32, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v52, v1, v3 +; SI-NEXT: v_alignbit_b32 v1, v52, v56, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v49, v1, v29 +; SI-NEXT: v_alignbit_b32 v1, v49, v61, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v38, v1, v30 +; SI-NEXT: v_alignbit_b32 v1, v38, v62, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v36, v1, v51 +; SI-NEXT: v_alignbit_b32 v1, v36, v27, 16 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_or_b32_e32 v34, v1, v48 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_alignbit_b32 v1, v34, v28, 16 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_or_b32_e32 v3, v1, v31 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_alignbit_b32 v1, v3, v44, 16 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v26, v3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: .LBB56_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v44, v24 +; SI-NEXT: v_add_i32_e32 v32, vcc, 0x30000, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v24, v31, v24 +; SI-NEXT: v_or_b32_e32 v22, v28, v22 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v22, v48, v22 +; SI-NEXT: v_or_b32_e32 v20, v27, v20 +; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v20, v51, v20 +; SI-NEXT: v_or_b32_e32 v18, v62, v18 +; SI-NEXT: v_add_i32_e32 v36, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v18, v30, v18 +; SI-NEXT: v_or_b32_e32 v16, v61, v16 +; SI-NEXT: v_add_i32_e32 v38, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v16, v29, v16 +; SI-NEXT: v_or_b32_e32 v14, v56, v14 +; SI-NEXT: v_add_i32_e32 v49, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v60, v12 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v63, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v58, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v46, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v41, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v54, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v52, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 +; SI-NEXT: v_add_i32_e32 v55, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v55 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_add_i32_e32 v42, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v42 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_add_i32_e32 v43, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v43 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_add_i32_e32 v45, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v45 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v47, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v57, vcc, s6, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v57 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v59, vcc, s6, v0 +; SI-NEXT: v_alignbit_b32 v0, v59, v2, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v57, v4, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v47, v6, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v45, v8, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v43, v10, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v42, v12, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v55, v14, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v52, v16, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v49, v18, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v38, v20, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v36, v22, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v34, v24, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v26, v32, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: .LBB56_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v36 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v32 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v59 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v47 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v45 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -39755,172 +37088,50 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v24, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v31 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v18, v30 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v33 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v35 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v24, v37 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v52i16_to_v52f16: @@ -40437,495 +37648,555 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; SI-LABEL: bitcast_v52i16_to_v52f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; SI-NEXT: s_lshr_b32 s43, s29, 16 -; SI-NEXT: s_lshr_b32 s42, s28, 16 -; SI-NEXT: s_lshr_b32 s41, s27, 16 -; SI-NEXT: s_lshr_b32 s40, s26, 16 -; SI-NEXT: s_lshr_b32 s15, s25, 16 -; SI-NEXT: s_lshr_b32 s14, s24, 16 -; SI-NEXT: s_lshr_b32 s13, s23, 16 -; SI-NEXT: s_lshr_b32 s12, s22, 16 -; SI-NEXT: s_lshr_b32 s11, s21, 16 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s16, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_writelane_b32 v26, s30, 0 +; SI-NEXT: v_writelane_b32 v26, s31, 1 +; SI-NEXT: v_writelane_b32 v26, s34, 2 +; SI-NEXT: v_writelane_b32 v26, s35, 3 +; SI-NEXT: v_writelane_b32 v26, s36, 4 +; SI-NEXT: v_writelane_b32 v26, s37, 5 +; SI-NEXT: v_writelane_b32 v26, s38, 6 +; SI-NEXT: v_writelane_b32 v26, s39, 7 +; SI-NEXT: v_writelane_b32 v26, s48, 8 +; SI-NEXT: v_writelane_b32 v26, s49, 9 +; SI-NEXT: v_writelane_b32 v26, s50, 10 +; SI-NEXT: v_writelane_b32 v26, s51, 11 +; SI-NEXT: v_writelane_b32 v26, s52, 12 +; SI-NEXT: v_writelane_b32 v26, s53, 13 +; SI-NEXT: v_writelane_b32 v26, s54, 14 +; SI-NEXT: v_writelane_b32 v26, s55, 15 +; SI-NEXT: v_writelane_b32 v26, s64, 16 +; SI-NEXT: v_writelane_b32 v26, s65, 17 +; SI-NEXT: v_writelane_b32 v26, s66, 18 +; SI-NEXT: v_writelane_b32 v26, s67, 19 +; SI-NEXT: v_writelane_b32 v26, s68, 20 +; SI-NEXT: v_writelane_b32 v26, s69, 21 +; SI-NEXT: v_writelane_b32 v26, s70, 22 +; SI-NEXT: v_writelane_b32 v26, s71, 23 +; SI-NEXT: v_writelane_b32 v26, s80, 24 +; SI-NEXT: v_writelane_b32 v26, s81, 25 +; SI-NEXT: v_writelane_b32 v26, s82, 26 +; SI-NEXT: v_writelane_b32 v26, s83, 27 +; SI-NEXT: v_writelane_b32 v26, s84, 28 +; SI-NEXT: v_writelane_b32 v26, s85, 29 +; SI-NEXT: v_writelane_b32 v26, s86, 30 +; SI-NEXT: v_writelane_b32 v26, s87, 31 +; SI-NEXT: v_writelane_b32 v26, s96, 32 +; SI-NEXT: v_writelane_b32 v26, s97, 33 +; SI-NEXT: v_writelane_b32 v26, s98, 34 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_readfirstlane_b32 s71, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v4 +; SI-NEXT: v_readfirstlane_b32 s82, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_readfirstlane_b32 s98, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; SI-NEXT: v_readfirstlane_b32 s70, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; SI-NEXT: v_readfirstlane_b32 s87, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; SI-NEXT: s_lshr_b32 s54, s29, 16 +; SI-NEXT: s_lshr_b32 s91, s28, 16 +; SI-NEXT: s_lshr_b32 s53, s27, 16 +; SI-NEXT: s_lshr_b32 s89, s26, 16 +; SI-NEXT: s_lshr_b32 s52, s25, 16 +; SI-NEXT: s_lshr_b32 s79, s24, 16 +; SI-NEXT: s_lshr_b32 s51, s23, 16 +; SI-NEXT: s_lshr_b32 s77, s22, 16 +; SI-NEXT: s_lshr_b32 s50, s21, 16 +; SI-NEXT: s_lshr_b32 s75, s20, 16 +; SI-NEXT: s_lshr_b32 s49, s19, 16 +; SI-NEXT: s_lshr_b32 s73, s18, 16 +; SI-NEXT: s_lshr_b32 s48, s17, 16 +; SI-NEXT: s_lshr_b32 s63, s16, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_writelane_b32 v26, s99, 35 +; SI-NEXT: v_readfirstlane_b32 s80, v6 +; SI-NEXT: v_readfirstlane_b32 s97, v5 +; SI-NEXT: v_readfirstlane_b32 s99, v4 +; SI-NEXT: v_readfirstlane_b32 s31, v3 +; SI-NEXT: v_readfirstlane_b32 s96, v2 +; SI-NEXT: v_readfirstlane_b32 s93, v1 +; SI-NEXT: v_readfirstlane_b32 s95, v0 +; SI-NEXT: v_readfirstlane_b32 s66, v13 +; SI-NEXT: v_readfirstlane_b32 s85, v14 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; SI-NEXT: v_readfirstlane_b32 s68, v15 +; SI-NEXT: v_readfirstlane_b32 s86, v16 +; SI-NEXT: v_readfirstlane_b32 s67, v17 +; SI-NEXT: v_readfirstlane_b32 s84, v18 +; SI-NEXT: v_readfirstlane_b32 s65, v19 +; SI-NEXT: v_readfirstlane_b32 s83, v11 +; SI-NEXT: v_readfirstlane_b32 s64, v10 +; SI-NEXT: v_readfirstlane_b32 s81, v9 +; SI-NEXT: v_readfirstlane_b32 s55, v8 +; SI-NEXT: v_readfirstlane_b32 s69, v7 ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v12, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s10 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s13 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s15 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s41 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s29 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 -; SI-NEXT: v_mov_b32_e32 v22, v16 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 -; SI-NEXT: v_mov_b32_e32 v23, v17 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 -; SI-NEXT: v_mov_b32_e32 v24, v18 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v3 -; SI-NEXT: v_mov_b32_e32 v25, v19 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v4 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s24 -; SI-NEXT: v_mov_b32_e32 v21, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v5 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v41 -; SI-NEXT: v_mov_b32_e32 v14, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v43 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v12, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v7 -; SI-NEXT: v_mov_b32_e32 v16, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v8 -; SI-NEXT: v_mov_b32_e32 v17, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v9 -; SI-NEXT: v_mov_b32_e32 v18, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v10 -; SI-NEXT: v_mov_b32_e32 v19, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v11 -; SI-NEXT: v_mov_b32_e32 v20, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s48, 16 +; SI-NEXT: s_lshl_b32 s62, s63, 16 +; SI-NEXT: s_mov_b32 s9, s63 +; SI-NEXT: s_or_b32 s63, s5, s7 +; SI-NEXT: s_and_b32 s5, s19, 0xffff +; SI-NEXT: s_lshl_b32 s7, s49, 16 +; SI-NEXT: s_lshl_b32 s72, s73, 16 +; SI-NEXT: s_mov_b32 s11, s73 +; SI-NEXT: s_or_b32 s73, s5, s7 +; SI-NEXT: s_and_b32 s5, s21, 0xffff +; SI-NEXT: s_lshl_b32 s7, s50, 16 +; SI-NEXT: s_lshl_b32 s74, s75, 16 +; SI-NEXT: s_mov_b32 s88, s75 +; SI-NEXT: s_or_b32 s75, s5, s7 +; SI-NEXT: s_and_b32 s5, s23, 0xffff +; SI-NEXT: s_lshl_b32 s7, s51, 16 +; SI-NEXT: s_lshl_b32 s76, s77, 16 +; SI-NEXT: s_lshl_b32 s60, s89, 16 +; SI-NEXT: s_lshl_b32 s58, s91, 16 +; SI-NEXT: s_mov_b32 s92, s91 +; SI-NEXT: s_mov_b32 s91, s89 +; SI-NEXT: s_mov_b32 s89, s77 +; SI-NEXT: s_or_b32 s77, s5, s7 +; SI-NEXT: s_and_b32 s5, s25, 0xffff +; SI-NEXT: s_lshl_b32 s7, s52, 16 +; SI-NEXT: s_lshl_b32 s78, s79, 16 +; SI-NEXT: s_mov_b32 s90, s79 +; SI-NEXT: s_or_b32 s79, s5, s7 +; SI-NEXT: s_and_b32 s5, s27, 0xffff +; SI-NEXT: s_lshl_b32 s7, s53, 16 +; SI-NEXT: s_or_b32 s61, s5, s7 +; SI-NEXT: s_and_b32 s5, s29, 0xffff +; SI-NEXT: s_lshl_b32 s7, s54, 16 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_or_b32 s59, s5, s7 +; SI-NEXT: s_and_b32 s5, s93, 0xffff +; SI-NEXT: s_lshl_b32 s7, s55, 16 +; SI-NEXT: s_or_b32 s14, s4, s62 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_or_b32 s57, s5, s7 +; SI-NEXT: s_and_b32 s5, s31, 0xffff +; SI-NEXT: s_lshl_b32 s7, s64, 16 +; SI-NEXT: s_or_b32 s12, s4, s72 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_or_b32 s47, s5, s7 +; SI-NEXT: s_and_b32 s5, s97, 0xffff +; SI-NEXT: s_lshl_b32 s7, s65, 16 +; SI-NEXT: s_or_b32 s10, s4, s74 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_or_b32 s45, s5, s7 +; SI-NEXT: s_and_b32 s5, s87, 0xffff +; SI-NEXT: s_lshl_b32 s7, s67, 16 +; SI-NEXT: s_or_b32 s8, s4, s76 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_or_b32 s43, s5, s7 +; SI-NEXT: s_and_b32 s5, s98, 0xffff +; SI-NEXT: s_lshl_b32 s7, s68, 16 +; SI-NEXT: s_mov_b32 s15, s63 +; SI-NEXT: s_lshr_b64 s[62:63], s[62:63], 16 +; SI-NEXT: s_mov_b32 s13, s73 +; SI-NEXT: s_lshr_b64 s[72:73], s[72:73], 16 +; SI-NEXT: s_or_b32 s6, s4, s78 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_or_b32 s41, s5, s7 +; SI-NEXT: s_and_b32 s5, s71, 0xffff +; SI-NEXT: s_lshl_b32 s7, s66, 16 +; SI-NEXT: s_mov_b32 s63, s9 +; SI-NEXT: s_mov_b32 s73, s11 +; SI-NEXT: s_mov_b32 s11, s75 +; SI-NEXT: s_lshr_b64 s[74:75], s[74:75], 16 +; SI-NEXT: s_mov_b32 s9, s77 +; SI-NEXT: s_lshr_b64 s[76:77], s[76:77], 16 +; SI-NEXT: s_or_b32 s4, s4, s60 +; SI-NEXT: s_or_b32 vcc_hi, s5, s7 +; SI-NEXT: s_mov_b32 s75, s88 +; SI-NEXT: s_mov_b32 s77, s89 +; SI-NEXT: s_mov_b32 s7, s79 +; SI-NEXT: s_lshr_b64 s[78:79], s[78:79], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[60:61], 16 +; SI-NEXT: s_and_b32 s60, s28, 0xffff +; SI-NEXT: s_lshl_b32 s56, s69, 16 +; SI-NEXT: s_mov_b32 s79, s90 +; SI-NEXT: s_mov_b32 s89, s91 +; SI-NEXT: s_or_b32 s60, s60, s58 +; SI-NEXT: s_lshr_b64 s[90:91], s[58:59], 16 +; SI-NEXT: s_and_b32 s58, s95, 0xffff +; SI-NEXT: s_lshl_b32 s46, s81, 16 +; SI-NEXT: s_mov_b32 s91, s92 +; SI-NEXT: s_or_b32 s58, s58, s56 +; SI-NEXT: s_mov_b32 s94, s93 +; SI-NEXT: s_lshr_b64 s[92:93], s[56:57], 16 +; SI-NEXT: s_and_b32 s56, s96, 0xffff +; SI-NEXT: s_lshl_b32 s44, s83, 16 +; SI-NEXT: s_mov_b32 s93, s94 +; SI-NEXT: s_or_b32 s56, s56, s46 +; SI-NEXT: s_mov_b32 s30, s95 +; SI-NEXT: s_lshr_b64 s[94:95], s[46:47], 16 +; SI-NEXT: s_and_b32 s46, s99, 0xffff +; SI-NEXT: s_lshl_b32 s42, s84, 16 +; SI-NEXT: s_mov_b32 s95, s30 +; SI-NEXT: s_or_b32 s46, s46, s44 +; SI-NEXT: s_mov_b32 s34, s31 +; SI-NEXT: s_lshr_b64 s[30:31], s[44:45], 16 +; SI-NEXT: s_and_b32 s44, s80, 0xffff +; SI-NEXT: s_lshl_b32 s40, s86, 16 +; SI-NEXT: s_mov_b32 s31, s34 +; SI-NEXT: s_or_b32 s44, s44, s42 +; SI-NEXT: s_lshr_b64 s[34:35], s[42:43], 16 +; SI-NEXT: s_and_b32 s42, s70, 0xffff +; SI-NEXT: s_lshl_b32 vcc_lo, s85, 16 +; SI-NEXT: s_or_b32 s42, s42, s40 +; SI-NEXT: s_lshr_b64 s[36:37], s[40:41], 16 +; SI-NEXT: s_and_b32 s40, s82, 0xffff +; SI-NEXT: s_mov_b32 s5, s61 +; SI-NEXT: s_mov_b32 s61, s59 +; SI-NEXT: s_mov_b32 s59, s57 +; SI-NEXT: s_mov_b32 s57, s47 +; SI-NEXT: s_mov_b32 s47, s45 +; SI-NEXT: s_mov_b32 s45, s43 +; SI-NEXT: s_mov_b32 s43, s41 +; SI-NEXT: s_or_b32 s40, s40, vcc_lo +; SI-NEXT: s_mov_b32 s41, vcc_hi +; SI-NEXT: s_lshr_b64 s[38:39], vcc, 16 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_add_i32_e32 v41, vcc, 3, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s16 -; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s6 -; SI-NEXT: s_add_i32 s8, s8, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s17 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v20 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s8 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v19 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s19 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v18 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s20 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v17 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s21 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v16 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v14 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s24 -; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: s_add_i32 s43, s43, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s82, s82, 3 +; SI-NEXT: s_and_b32 s4, s82, 0xffff +; SI-NEXT: s_lshl_b32 s5, s85, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s71, s71, 3 +; SI-NEXT: s_add_i32 s40, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s71, 0xffff +; SI-NEXT: s_lshl_b32 s5, s66, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s70, s70, 3 +; SI-NEXT: s_add_i32 s41, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s70, 0xffff +; SI-NEXT: s_lshl_b32 s5, s86, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s98, s98, 3 +; SI-NEXT: s_add_i32 s42, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s98, 0xffff +; SI-NEXT: s_lshl_b32 s5, s68, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s80, s80, 3 +; SI-NEXT: s_add_i32 s43, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s80, 0xffff +; SI-NEXT: s_lshl_b32 s5, s84, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s87, s87, 3 +; SI-NEXT: s_add_i32 s44, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s87, 0xffff +; SI-NEXT: s_lshl_b32 s5, s67, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s99, s99, 3 +; SI-NEXT: s_add_i32 s45, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s99, 0xffff +; SI-NEXT: s_lshl_b32 s5, s83, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s97, s97, 3 +; SI-NEXT: s_add_i32 s46, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s97, 0xffff +; SI-NEXT: s_lshl_b32 s5, s65, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s96, s96, 3 +; SI-NEXT: s_add_i32 s47, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s96, 0xffff +; SI-NEXT: s_lshl_b32 s5, s81, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s56, s4, 0x30000 +; SI-NEXT: s_add_i32 s4, s31, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s64, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s57, s4, 0x30000 +; SI-NEXT: s_add_i32 s4, s95, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s69, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s58, s4, 0x30000 +; SI-NEXT: s_add_i32 s4, s93, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s55, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s40, s40, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s14, s14, 3 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: s_add_i32 s12, s12, 3 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_add_i32 s10, s10, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s13 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v35, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s15 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_add_i32 s59, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s91, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s60, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s54, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s61, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s89, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s27, 0xffff +; SI-NEXT: s_lshl_b32 s6, s53, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xffff +; SI-NEXT: s_lshl_b32 s7, s79, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s25, 0xffff +; SI-NEXT: s_lshl_b32 s8, s52, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s22, 0xffff +; SI-NEXT: s_lshl_b32 s9, s77, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s23, 0xffff +; SI-NEXT: s_lshl_b32 s10, s51, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s20, 0xffff +; SI-NEXT: s_lshl_b32 s11, s75, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s21, 0xffff +; SI-NEXT: s_lshl_b32 s12, s50, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s12, s18, 0xffff +; SI-NEXT: s_lshl_b32 s13, s73, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s19, 0xffff +; SI-NEXT: s_lshl_b32 s14, s49, 16 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_and_b32 s14, s16, 0xffff +; SI-NEXT: s_lshl_b32 s15, s63, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: s_and_b32 s15, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s48, 16 +; SI-NEXT: s_or_b32 s15, s16, s15 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_lshr_b64 s[62:63], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[60:61], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[58:59], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[56:57], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[36:37], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[38:39], s[40:41], 16 +; SI-NEXT: s_lshr_b32 s48, s15, 16 +; SI-NEXT: s_lshr_b32 s49, s13, 16 +; SI-NEXT: s_lshr_b32 s50, s11, 16 +; SI-NEXT: s_lshr_b32 s51, s9, 16 +; SI-NEXT: s_lshr_b32 s52, s7, 16 +; SI-NEXT: s_lshr_b32 s53, s5, 16 +; SI-NEXT: s_lshr_b32 s54, s61, 16 +; SI-NEXT: s_lshr_b32 s55, s59, 16 +; SI-NEXT: s_lshr_b32 s64, s57, 16 +; SI-NEXT: s_lshr_b32 s65, s47, 16 +; SI-NEXT: s_lshr_b32 s67, s45, 16 +; SI-NEXT: s_lshr_b32 s68, s43, 16 +; SI-NEXT: s_lshr_b32 s66, s41, 16 ; SI-NEXT: .LBB57_3: ; %end -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v13 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v12, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v63 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v4, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v47 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v6, v31 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v35 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v10, v38 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v49 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v32 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v51 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v29 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v53 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v33 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v56 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v40 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v57 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v41 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v59 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v43 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v45 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s16, s62, 16 +; SI-NEXT: s_or_b32 s14, s14, s16 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s16, s48, 16 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s16, s72, 16 +; SI-NEXT: s_or_b32 s12, s12, s16 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s16, s49, 16 +; SI-NEXT: s_or_b32 s13, s13, s16 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s16, s74, 16 +; SI-NEXT: s_or_b32 s10, s10, s16 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s16, s50, 16 +; SI-NEXT: s_or_b32 s11, s11, s16 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s16, s76, 16 +; SI-NEXT: s_or_b32 s8, s8, s16 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s16, s51, 16 +; SI-NEXT: s_or_b32 s9, s9, s16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s16, s78, 16 +; SI-NEXT: s_or_b32 s6, s6, s16 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s16, s52, 16 +; SI-NEXT: s_or_b32 s7, s7, s16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s16, s88, 16 +; SI-NEXT: s_or_b32 s4, s4, s16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s16, s53, 16 +; SI-NEXT: s_or_b32 s5, s5, s16 +; SI-NEXT: s_and_b32 s16, s60, 0xffff +; SI-NEXT: s_lshl_b32 s17, s90, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s61, 0xffff +; SI-NEXT: s_lshl_b32 s18, s54, 16 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_and_b32 s18, s58, 0xffff +; SI-NEXT: s_lshl_b32 s19, s92, 16 +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: s_and_b32 s19, s59, 0xffff +; SI-NEXT: s_lshl_b32 s20, s55, 16 +; SI-NEXT: s_or_b32 s19, s19, s20 +; SI-NEXT: s_and_b32 s20, s56, 0xffff +; SI-NEXT: s_lshl_b32 s21, s94, 16 +; SI-NEXT: s_or_b32 s20, s20, s21 +; SI-NEXT: s_and_b32 s21, s57, 0xffff +; SI-NEXT: s_lshl_b32 s22, s64, 16 +; SI-NEXT: s_or_b32 s21, s21, s22 +; SI-NEXT: s_and_b32 s22, s46, 0xffff +; SI-NEXT: s_lshl_b32 s23, s30, 16 +; SI-NEXT: s_or_b32 s22, s22, s23 +; SI-NEXT: s_and_b32 s23, s47, 0xffff +; SI-NEXT: s_lshl_b32 s24, s65, 16 +; SI-NEXT: s_or_b32 s23, s23, s24 +; SI-NEXT: s_and_b32 s24, s44, 0xffff +; SI-NEXT: s_lshl_b32 s25, s34, 16 +; SI-NEXT: s_or_b32 s24, s24, s25 +; SI-NEXT: s_and_b32 s25, s45, 0xffff +; SI-NEXT: s_lshl_b32 s26, s67, 16 +; SI-NEXT: s_or_b32 s25, s25, s26 +; SI-NEXT: s_and_b32 s26, s42, 0xffff +; SI-NEXT: s_lshl_b32 s27, s36, 16 +; SI-NEXT: s_or_b32 s26, s26, s27 +; SI-NEXT: s_and_b32 s27, s43, 0xffff +; SI-NEXT: s_lshl_b32 s28, s68, 16 +; SI-NEXT: s_or_b32 s27, s27, s28 +; SI-NEXT: s_and_b32 s28, s40, 0xffff +; SI-NEXT: s_lshl_b32 s29, s38, 16 +; SI-NEXT: s_or_b32 s28, s28, s29 +; SI-NEXT: s_and_b32 s29, s41, 0xffff +; SI-NEXT: s_lshl_b32 s40, s66, 16 +; SI-NEXT: s_or_b32 s29, s29, s40 +; SI-NEXT: v_mov_b32_e32 v0, s14 +; SI-NEXT: v_mov_b32_e32 v1, s15 +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: v_mov_b32_e32 v3, s13 +; SI-NEXT: v_mov_b32_e32 v4, s10 +; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_mov_b32_e32 v6, s8 +; SI-NEXT: v_mov_b32_e32 v7, s9 +; SI-NEXT: v_mov_b32_e32 v8, s6 +; SI-NEXT: v_mov_b32_e32 v9, s7 +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: v_mov_b32_e32 v11, s5 +; SI-NEXT: v_mov_b32_e32 v12, s16 +; SI-NEXT: v_mov_b32_e32 v13, s17 +; SI-NEXT: v_mov_b32_e32 v14, s18 +; SI-NEXT: v_mov_b32_e32 v15, s19 +; SI-NEXT: v_mov_b32_e32 v16, s20 +; SI-NEXT: v_mov_b32_e32 v17, s21 +; SI-NEXT: v_mov_b32_e32 v18, s22 +; SI-NEXT: v_mov_b32_e32 v19, s23 +; SI-NEXT: v_mov_b32_e32 v20, s24 +; SI-NEXT: v_mov_b32_e32 v21, s25 +; SI-NEXT: v_mov_b32_e32 v22, s26 +; SI-NEXT: v_mov_b32_e32 v23, s27 +; SI-NEXT: v_mov_b32_e32 v24, s28 +; SI-NEXT: v_mov_b32_e32 v25, s29 +; SI-NEXT: v_readlane_b32 s99, v26, 35 +; SI-NEXT: v_readlane_b32 s98, v26, 34 +; SI-NEXT: v_readlane_b32 s97, v26, 33 +; SI-NEXT: v_readlane_b32 s96, v26, 32 +; SI-NEXT: v_readlane_b32 s87, v26, 31 +; SI-NEXT: v_readlane_b32 s86, v26, 30 +; SI-NEXT: v_readlane_b32 s85, v26, 29 +; SI-NEXT: v_readlane_b32 s84, v26, 28 +; SI-NEXT: v_readlane_b32 s83, v26, 27 +; SI-NEXT: v_readlane_b32 s82, v26, 26 +; SI-NEXT: v_readlane_b32 s81, v26, 25 +; SI-NEXT: v_readlane_b32 s80, v26, 24 +; SI-NEXT: v_readlane_b32 s71, v26, 23 +; SI-NEXT: v_readlane_b32 s70, v26, 22 +; SI-NEXT: v_readlane_b32 s69, v26, 21 +; SI-NEXT: v_readlane_b32 s68, v26, 20 +; SI-NEXT: v_readlane_b32 s67, v26, 19 +; SI-NEXT: v_readlane_b32 s66, v26, 18 +; SI-NEXT: v_readlane_b32 s65, v26, 17 +; SI-NEXT: v_readlane_b32 s64, v26, 16 +; SI-NEXT: v_readlane_b32 s55, v26, 15 +; SI-NEXT: v_readlane_b32 s54, v26, 14 +; SI-NEXT: v_readlane_b32 s53, v26, 13 +; SI-NEXT: v_readlane_b32 s52, v26, 12 +; SI-NEXT: v_readlane_b32 s51, v26, 11 +; SI-NEXT: v_readlane_b32 s50, v26, 10 +; SI-NEXT: v_readlane_b32 s49, v26, 9 +; SI-NEXT: v_readlane_b32 s48, v26, 8 +; SI-NEXT: v_readlane_b32 s39, v26, 7 +; SI-NEXT: v_readlane_b32 s38, v26, 6 +; SI-NEXT: v_readlane_b32 s37, v26, 5 +; SI-NEXT: v_readlane_b32 s36, v26, 4 +; SI-NEXT: v_readlane_b32 s35, v26, 3 +; SI-NEXT: v_readlane_b32 s34, v26, 2 +; SI-NEXT: v_readlane_b32 s31, v26, 1 +; SI-NEXT: v_readlane_b32 s30, v26, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; kill: killed $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; kill: killed $vgpr15 -; SI-NEXT: v_mov_b32_e32 v25, v19 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; kill: killed $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; kill: killed $vgpr15 -; SI-NEXT: v_mov_b32_e32 v24, v18 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; kill: killed $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; kill: killed $vgpr15 -; SI-NEXT: v_mov_b32_e32 v23, v17 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; kill: killed $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: v_mov_b32_e32 v22, v16 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; kill: killed $vgpr13 -; SI-NEXT: v_mov_b32_e32 v21, v14 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: v_mov_b32_e32 v20, v63 -; SI-NEXT: v_mov_b32_e32 v19, v61 -; SI-NEXT: v_mov_b32_e32 v18, v47 -; SI-NEXT: v_mov_b32_e32 v17, v46 -; SI-NEXT: v_mov_b32_e32 v16, v45 -; SI-NEXT: v_mov_b32_e32 v14, v43 -; SI-NEXT: v_mov_b32_e32 v12, v41 -; SI-NEXT: ; kill: killed $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; kill: killed $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr38 ; SI-NEXT: s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v52i16_to_v52f16_scalar: @@ -41749,7 +39020,6 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v52f16_to_v52i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill @@ -41759,137 +39029,35 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v38 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v35 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v38 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v45, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v29 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v35 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v38 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v35 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v38 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v46, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v45 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v5 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -41934,212 +39102,216 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v49 ; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v52 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_or_b32_e32 v25, v25, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v28, v28, v49 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_or_b32_e32 v23, v23, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v24, v24, v49 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_or_b32_e32 v21, v21, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v29, v29, v49 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_or_b32_e32 v19, v19, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_or_b32_e32 v31, v31, v49 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v18, v18, v49 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v17, v17, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_or_b32_e32 v32, v32, v49 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v13 +; SI-NEXT: v_or_b32_e32 v15, v15, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_or_b32_e32 v13, v13, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_or_b32_e32 v34, v34, v49 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43 -; SI-NEXT: v_or_b32_e32 v12, v12, v49 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v9 +; SI-NEXT: v_or_b32_e32 v11, v11, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_or_b32_e32 v35, v35, v49 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_or_b32_e32 v9, v9, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v37, v37, v49 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v5 -; SI-NEXT: v_or_b32_e32 v6, v6, v49 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v3 -; SI-NEXT: v_or_b32_e32 v39, v39, v49 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v7, v7, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v37 +; SI-NEXT: v_or_b32_e32 v5, v5, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v38 +; SI-NEXT: v_or_b32_e32 v3, v3, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v39 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v44 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v42 ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v55 -; SI-NEXT: v_or_b32_e32 v2, v2, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v49 ; SI-NEXT: v_or_b32_e32 v0, v0, v26 -; SI-NEXT: v_or_b32_e32 v38, v38, v43 +; SI-NEXT: v_or_b32_e32 v2, v2, v43 ; SI-NEXT: v_or_b32_e32 v4, v4, v41 -; SI-NEXT: v_or_b32_e32 v8, v8, v40 -; SI-NEXT: v_or_b32_e32 v36, v36, v45 +; SI-NEXT: v_or_b32_e32 v6, v6, v40 +; SI-NEXT: v_or_b32_e32 v8, v8, v45 ; SI-NEXT: v_or_b32_e32 v10, v10, v54 -; SI-NEXT: v_or_b32_e32 v14, v14, v53 -; SI-NEXT: v_or_b32_e32 v33, v33, v46 +; SI-NEXT: v_or_b32_e32 v12, v12, v53 +; SI-NEXT: v_or_b32_e32 v14, v14, v46 ; SI-NEXT: v_or_b32_e32 v16, v16, v51 -; SI-NEXT: v_or_b32_e32 v20, v20, v50 -; SI-NEXT: v_or_b32_e32 v30, v30, v47 +; SI-NEXT: v_or_b32_e32 v18, v18, v50 +; SI-NEXT: v_or_b32_e32 v20, v20, v47 ; SI-NEXT: v_or_b32_e32 v22, v22, v48 -; SI-NEXT: v_or_b32_e32 v27, v27, v56 -; SI-NEXT: v_alignbit_b32 v44, v2, v26, 16 -; SI-NEXT: v_alignbit_b32 v43, v39, v43, 16 -; SI-NEXT: v_alignbit_b32 v42, v6, v41, 16 -; SI-NEXT: v_alignbit_b32 v41, v37, v40, 16 -; SI-NEXT: v_alignbit_b32 v40, v35, v45, 16 -; SI-NEXT: v_alignbit_b32 v55, v12, v54, 16 -; SI-NEXT: v_alignbit_b32 v54, v34, v53, 16 -; SI-NEXT: v_alignbit_b32 v53, v32, v46, 16 -; SI-NEXT: v_alignbit_b32 v52, v18, v51, 16 -; SI-NEXT: v_alignbit_b32 v51, v31, v50, 16 -; SI-NEXT: v_alignbit_b32 v50, v29, v47, 16 -; SI-NEXT: v_alignbit_b32 v49, v24, v48, 16 -; SI-NEXT: v_alignbit_b32 v48, v28, v56, 16 +; SI-NEXT: v_or_b32_e32 v24, v24, v56 +; SI-NEXT: v_alignbit_b32 v44, v1, v26, 16 +; SI-NEXT: v_alignbit_b32 v43, v3, v43, 16 +; SI-NEXT: v_alignbit_b32 v42, v5, v41, 16 +; SI-NEXT: v_alignbit_b32 v41, v7, v40, 16 +; SI-NEXT: v_alignbit_b32 v40, v9, v45, 16 +; SI-NEXT: v_alignbit_b32 v55, v11, v54, 16 +; SI-NEXT: v_alignbit_b32 v54, v13, v53, 16 +; SI-NEXT: v_alignbit_b32 v53, v15, v46, 16 +; SI-NEXT: v_alignbit_b32 v52, v17, v51, 16 +; SI-NEXT: v_alignbit_b32 v51, v19, v50, 16 +; SI-NEXT: v_alignbit_b32 v50, v21, v47, 16 +; SI-NEXT: v_alignbit_b32 v49, v23, v48, 16 +; SI-NEXT: v_alignbit_b32 v48, v25, v56, 16 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v44 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v26 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v26 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v43 ; SI-NEXT: v_or_b32_e32 v2, v2, v26 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v39 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v26, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v38 +; SI-NEXT: v_or_b32_e32 v3, v3, v26 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v42 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v4, v4, v26 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v37 +; SI-NEXT: v_or_b32_e32 v5, v5, v26 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v41 +; SI-NEXT: v_or_b32_e32 v6, v6, v26 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v36 +; SI-NEXT: v_or_b32_e32 v7, v7, v26 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v40 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload @@ -42151,63 +39323,59 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v6, v6, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v37 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v36 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_or_b32_e32 v8, v8, v26 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v35 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v54 -; SI-NEXT: v_or_b32_e32 v9, v26, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 +; SI-NEXT: v_or_b32_e32 v9, v9, v26 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v55 -; SI-NEXT: v_or_b32_e32 v12, v12, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v34 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v10, v10, v26 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v33 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v34 +; SI-NEXT: v_or_b32_e32 v11, v11, v26 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v54 +; SI-NEXT: v_or_b32_e32 v12, v12, v26 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v33 +; SI-NEXT: v_or_b32_e32 v13, v13, v26 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v53 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v14, v14, v26 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v32 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v51 -; SI-NEXT: v_or_b32_e32 v15, v26, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v32 +; SI-NEXT: v_or_b32_e32 v15, v15, v26 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v52 -; SI-NEXT: v_or_b32_e32 v18, v18, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v31 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v16, v16, v26 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v30 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v31 +; SI-NEXT: v_or_b32_e32 v17, v17, v26 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v51 +; SI-NEXT: v_or_b32_e32 v18, v18, v26 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v30 +; SI-NEXT: v_or_b32_e32 v19, v19, v26 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v50 ; SI-NEXT: v_or_b32_e32 v20, v20, v26 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v26, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v29 +; SI-NEXT: v_or_b32_e32 v21, v21, v26 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v49 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v22, v22, v26 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v28 +; SI-NEXT: v_or_b32_e32 v23, v23, v26 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v48 ; SI-NEXT: v_or_b32_e32 v24, v24, v26 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -42726,488 +39894,414 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i ; SI-LABEL: bitcast_v52f16_to_v52i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s16 -; SI-NEXT: s_lshr_b32 s42, s17, 16 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s22 -; SI-NEXT: s_lshr_b32 s14, s21, 16 -; SI-NEXT: s_lshr_b32 s40, s19, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s40 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v61, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s26 -; SI-NEXT: s_lshr_b32 s10, s25, 16 -; SI-NEXT: s_lshr_b32 s12, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s12 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9 -; SI-NEXT: s_lshr_b32 s6, s29, 16 -; SI-NEXT: s_lshr_b32 s8, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 +; SI-NEXT: v_mov_b32_e32 v28, v10 +; SI-NEXT: v_mov_b32_e32 v25, v8 +; SI-NEXT: v_mov_b32_e32 v26, v6 +; SI-NEXT: v_mov_b32_e32 v27, v4 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v17, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v10 -; SI-NEXT: s_lshr_b32 s7, s28, 16 -; SI-NEXT: s_lshr_b32 s9, s26, 16 -; SI-NEXT: s_lshr_b32 s11, s24, 16 -; SI-NEXT: s_lshr_b32 s13, s22, 16 -; SI-NEXT: s_lshr_b32 s15, s20, 16 -; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s12, s29, 16 +; SI-NEXT: s_lshr_b32 s13, s28, 16 +; SI-NEXT: s_lshr_b32 s11, s27, 16 +; SI-NEXT: s_lshr_b32 s14, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s15, s24, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 16 +; SI-NEXT: s_lshr_b32 s40, s22, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s41, s20, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s42, s18, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 ; SI-NEXT: s_lshr_b32 s43, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v5 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v45, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v55, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v13 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v22 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; SI-NEXT: s_cbranch_scc0 .LBB59_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: s_cbranch_execnz .LBB59_4 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v0 +; SI-NEXT: v_or_b32_e32 v2, v2, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s18 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v37 -; SI-NEXT: v_or_b32_e32 v20, v20, v0 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v12 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s40 +; SI-NEXT: v_or_b32_e32 v2, v4, v23 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s24 +; SI-NEXT: v_or_b32_e32 v2, v2, v32 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, s14 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s26 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v51, v0, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s13 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s28 +; SI-NEXT: v_or_b32_e32 v34, v4, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v0 +; SI-NEXT: v_or_b32_e32 v59, v2, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v16 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v17, v4, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v16, v6, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v20 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v38 -; SI-NEXT: v_or_b32_e32 v38, v22, v2 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 +; SI-NEXT: v_or_b32_e32 v27, v6, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v25 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v32 -; SI-NEXT: v_or_b32_e32 v32, v22, v4 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_or_b32_e32 v49, v20, v6 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v36 +; SI-NEXT: v_or_b32_e32 v26, v10, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v58 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v36 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v36, v20, v8 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v10, v48 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v39 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v25, v13, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v28 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v15 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v28, v13, v10 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v36 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v21 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v20 +; SI-NEXT: v_or_b32_e32 v9, v9, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v13 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v15 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v22 +; SI-NEXT: v_or_b32_e32 v7, v7, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v21 +; SI-NEXT: v_or_b32_e32 v5, v5, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v37 +; SI-NEXT: v_or_b32_e32 v3, v3, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s27 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_or_b32_e32 v22, v22, v10 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v24 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v55 -; SI-NEXT: v_or_b32_e32 v22, v22, v12 -; SI-NEXT: v_or_b32_e32 v20, v20, v14 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v30 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_or_b32_e32 v48, v24, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v47 -; SI-NEXT: v_or_b32_e32 v50, v20, v18 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v46 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_or_b32_e32 v28, v24, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v51 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v34, v27, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v29 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v27 -; SI-NEXT: v_or_b32_e32 v37, v26, v24 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v29 -; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v58 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v30 -; SI-NEXT: v_or_b32_e32 v23, v23, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v35 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v26 -; SI-NEXT: v_lshr_b64 v[46:47], v[22:23], 16 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v58 -; SI-NEXT: v_or_b32_e32 v21, v21, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v27 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v31 -; SI-NEXT: v_or_b32_e32 v19, v19, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v56 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v62 -; SI-NEXT: v_or_b32_e32 v17, v17, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v45 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v26 -; SI-NEXT: v_mov_b32_e32 v47, v28 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v56 -; SI-NEXT: v_or_b32_e32 v15, v15, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v27 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v45 -; SI-NEXT: v_or_b32_e32 v13, v13, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v44 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v59 -; SI-NEXT: v_or_b32_e32 v11, v11, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v57 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v44 -; SI-NEXT: v_or_b32_e32 v9, v9, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v63 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v57 -; SI-NEXT: v_or_b32_e32 v7, v7, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v60 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v27 -; SI-NEXT: v_lshr_b64 v[52:53], v[6:7], 16 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v61 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v63 -; SI-NEXT: v_or_b32_e32 v5, v5, v27 -; SI-NEXT: v_lshr_b64 v[54:55], v[4:5], 16 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v60 -; SI-NEXT: v_or_b32_e32 v3, v3, v26 -; SI-NEXT: v_lshr_b64 v[40:41], v[2:3], 16 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v61 -; SI-NEXT: v_or_b32_e32 v1, v1, v26 -; SI-NEXT: v_lshr_b64 v[42:43], v[0:1], 16 -; SI-NEXT: v_mov_b32_e32 v43, v50 -; SI-NEXT: v_mov_b32_e32 v41, v49 -; SI-NEXT: v_mov_b32_e32 v55, v48 -; SI-NEXT: v_lshr_b64 v[50:51], v[8:9], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[10:11], 16 -; SI-NEXT: v_mov_b32_e32 v53, v38 -; SI-NEXT: v_mov_b32_e32 v51, v37 -; SI-NEXT: v_mov_b32_e32 v49, v36 -; SI-NEXT: v_lshr_b64 v[38:39], v[12:13], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[14:15], 16 -; SI-NEXT: v_mov_b32_e32 v39, v34 -; SI-NEXT: v_mov_b32_e32 v37, v32 -; SI-NEXT: v_lshr_b64 v[34:35], v[16:17], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[18:19], 16 -; SI-NEXT: v_mov_b32_e32 v35, v31 -; SI-NEXT: v_mov_b32_e32 v33, v30 -; SI-NEXT: v_lshr_b64 v[30:31], v[20:21], 16 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_lshr_b64 v[26:27], v[24:25], 16 -; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v60 +; SI-NEXT: v_or_b32_e32 v1, v1, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v61 +; SI-NEXT: v_or_b32_e32 v50, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62 +; SI-NEXT: v_or_b32_e32 v39, v18, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v55 +; SI-NEXT: v_or_b32_e32 v31, v13, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s8 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v29 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v19 +; SI-NEXT: v_or_b32_e32 v33, v18, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s7 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v63 +; SI-NEXT: v_lshr_b64 v[53:54], v[38:39], 16 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_or_b32_e32 v24, v15, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s17 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v56 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v57 +; SI-NEXT: v_or_b32_e32 v15, v18, v15 +; SI-NEXT: v_lshr_b64 v[46:47], v[14:15], 16 +; SI-NEXT: v_mov_b32_e32 v14, v34 +; SI-NEXT: v_lshr_b64 v[34:35], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[18:19], v[4:5], 16 +; SI-NEXT: v_mov_b32_e32 v35, v20 +; SI-NEXT: v_mov_b32_e32 v19, v21 +; SI-NEXT: v_lshr_b64 v[20:21], v[6:7], 16 +; SI-NEXT: v_mov_b32_e32 v12, v51 +; SI-NEXT: v_lshr_b64 v[44:45], v[23:24], 16 +; SI-NEXT: v_lshr_b64 v[51:52], v[49:50], 16 +; SI-NEXT: v_mov_b32_e32 v21, v22 +; SI-NEXT: v_lshr_b64 v[22:23], v[8:9], 16 +; SI-NEXT: v_mov_b32_e32 v52, v37 +; SI-NEXT: v_lshr_b64 v[37:38], v[2:3], 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v23, v59 +; SI-NEXT: v_lshr_b64 v[58:59], v[10:11], 16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[42:43], v[32:33], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[30:31], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[0:1], 16 +; SI-NEXT: s_branch .LBB59_5 +; SI-NEXT: .LBB59_3: +; SI-NEXT: s_branch .LBB59_2 +; SI-NEXT: .LBB59_4: +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v61, s12 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v62, s11 +; SI-NEXT: v_mov_b32_e32 v55, s10 +; SI-NEXT: v_mov_b32_e32 v29, s9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, s8 +; SI-NEXT: v_mov_b32_e32 v56, s7 +; SI-NEXT: v_mov_b32_e32 v57, s6 +; SI-NEXT: v_mov_b32_e32 v15, s17 +; SI-NEXT: v_mov_b32_e32 v13, s19 +; SI-NEXT: v_mov_b32_e32 v24, s21 +; SI-NEXT: v_mov_b32_e32 v33, s23 +; SI-NEXT: v_mov_b32_e32 v31, s25 +; SI-NEXT: v_mov_b32_e32 v23, s28 +; SI-NEXT: v_mov_b32_e32 v14, s26 +; SI-NEXT: v_mov_b32_e32 v12, s24 +; SI-NEXT: v_mov_b32_e32 v10, s22 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v6, s18 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v39, s27 +; SI-NEXT: v_mov_b32_e32 v50, s29 +; SI-NEXT: v_mov_b32_e32 v46, s43 +; SI-NEXT: v_mov_b32_e32 v34, s42 +; SI-NEXT: v_mov_b32_e32 v44, s41 +; SI-NEXT: v_mov_b32_e32 v42, s40 +; SI-NEXT: v_mov_b32_e32 v40, s15 +; SI-NEXT: v_mov_b32_e32 v53, s14 +; SI-NEXT: v_mov_b32_e32 v51, s13 +; SI-NEXT: .LBB59_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v46 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v61 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v57 +; SI-NEXT: v_or_b32_e32 v30, v2, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v6 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v60 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v54 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v37 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v56 +; SI-NEXT: v_or_b32_e32 v34, v4, v6 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v44 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v8 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v63 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v52 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v63 +; SI-NEXT: v_or_b32_e32 v32, v6, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v42 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v10 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v57 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v50 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v40 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v12 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v44 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v48 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v36 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v34 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v32 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v46 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v55 +; SI-NEXT: v_or_b32_e32 v31, v10, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v53 ; SI-NEXT: v_or_b32_e32 v10, v10, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v59 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_or_b32_e32 v12, v12, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v45 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v62 +; SI-NEXT: v_or_b32_e32 v33, v12, v13 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v51 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v61 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_or_b32_e32 v14, v14, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v56 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v16, v16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v62 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v43 -; SI-NEXT: v_or_b32_e32 v18, v18, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v35 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v47 -; SI-NEXT: v_or_b32_e32 v20, v20, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v58 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v48 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v60 +; SI-NEXT: v_or_b32_e32 v15, v1, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v37 +; SI-NEXT: v_or_b32_e32 v16, v1, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 +; SI-NEXT: v_or_b32_e32 v17, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_or_b32_e32 v23, v1, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -43224,19 +40318,19 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v22, v22, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v33 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v51 -; SI-NEXT: v_or_b32_e32 v24, v24, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v29 -; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v24, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36 +; SI-NEXT: v_or_b32_e32 v25, v1, v3 +; SI-NEXT: v_mov_b32_e32 v1, v30 +; SI-NEXT: v_mov_b32_e32 v3, v34 +; SI-NEXT: v_mov_b32_e32 v5, v32 +; SI-NEXT: v_mov_b32_e32 v7, v29 +; SI-NEXT: v_mov_b32_e32 v9, v31 +; SI-NEXT: v_mov_b32_e32 v11, v33 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB59_4: -; SI-NEXT: s_branch .LBB59_2 ; ; VI-LABEL: bitcast_v52f16_to_v52i16_scalar: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll index b42188f0f3980..9f342f95cd8ee 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll @@ -6903,555 +6903,245 @@ end: define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v28i32_to_v56f16: ; SI: ; %bb.0: -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v25 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v63, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v30 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v30 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v28, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v29, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v30, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v31, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v32, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v33, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v34, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v35, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v36, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v38, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v48, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v51, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v53, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v40, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v30 -; SI-NEXT: v_mov_b32_e32 v30, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 ; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_alignbit_b32 v28, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v29, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v30, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v31, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v32, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v33, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v34, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v35, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v36, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v38, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v48, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v51, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v53, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v40, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_mov_b32_e32 v46, v24 -; SI-NEXT: v_mov_b32_e32 v45, v25 -; SI-NEXT: v_mov_b32_e32 v43, v26 -; SI-NEXT: v_mov_b32_e32 v41, v27 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 ; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_or_b32_e32 v0, v0, v40 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v47 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v40 +; SI-NEXT: v_or_b32_e32 v2, v2, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v46 +; SI-NEXT: v_or_b32_e32 v4, v4, v51 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v45 +; SI-NEXT: v_or_b32_e32 v6, v6, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v44 +; SI-NEXT: v_or_b32_e32 v8, v8, v38 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v43 +; SI-NEXT: v_or_b32_e32 v10, v10, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v42 +; SI-NEXT: v_or_b32_e32 v12, v12, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v41 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v48 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v34 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v33 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v47 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v42 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v31 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v10, v61 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v57 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v44 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v58 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v62 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v46 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v43 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v30 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v14, v14, v34 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v55 +; SI-NEXT: v_or_b32_e32 v16, v16, v33 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v54 +; SI-NEXT: v_or_b32_e32 v18, v18, v32 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v52 +; SI-NEXT: v_or_b32_e32 v20, v20, v31 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v50 +; SI-NEXT: v_or_b32_e32 v22, v22, v30 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v49 +; SI-NEXT: v_or_b32_e32 v24, v24, v29 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v39 +; SI-NEXT: v_or_b32_e32 v26, v26, v28 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v37 +; SI-NEXT: v_or_b32_e32 v3, v3, v53 +; SI-NEXT: v_or_b32_e32 v5, v5, v51 +; SI-NEXT: v_or_b32_e32 v7, v7, v48 +; SI-NEXT: v_or_b32_e32 v9, v9, v38 +; SI-NEXT: v_or_b32_e32 v11, v11, v36 +; SI-NEXT: v_or_b32_e32 v13, v13, v35 +; SI-NEXT: v_or_b32_e32 v15, v15, v34 +; SI-NEXT: v_or_b32_e32 v17, v17, v33 +; SI-NEXT: v_or_b32_e32 v19, v19, v32 +; SI-NEXT: v_or_b32_e32 v21, v21, v31 +; SI-NEXT: v_or_b32_e32 v23, v23, v30 +; SI-NEXT: v_or_b32_e32 v25, v25, v29 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -8050,160 +7740,108 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i ; SI-LABEL: bitcast_v28i32_to_v56f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v28, s30, 0 +; SI-NEXT: v_writelane_b32 v28, s31, 1 +; SI-NEXT: v_writelane_b32 v28, s34, 2 +; SI-NEXT: v_writelane_b32 v28, s35, 3 +; SI-NEXT: v_writelane_b32 v28, s36, 4 +; SI-NEXT: v_writelane_b32 v28, s37, 5 +; SI-NEXT: v_writelane_b32 v28, s38, 6 +; SI-NEXT: v_writelane_b32 v28, s39, 7 +; SI-NEXT: v_writelane_b32 v28, s48, 8 +; SI-NEXT: v_writelane_b32 v28, s49, 9 ; SI-NEXT: v_mov_b32_e32 v15, s16 ; SI-NEXT: v_mov_b32_e32 v16, s17 ; SI-NEXT: v_mov_b32_e32 v17, s18 ; SI-NEXT: v_mov_b32_e32 v18, s19 +; SI-NEXT: v_writelane_b32 v28, s50, 10 ; SI-NEXT: v_mov_b32_e32 v19, s20 -; SI-NEXT: v_readfirstlane_b32 s40, v15 +; SI-NEXT: v_readfirstlane_b32 s44, v15 ; SI-NEXT: v_mov_b32_e32 v15, s21 -; SI-NEXT: v_readfirstlane_b32 s41, v16 +; SI-NEXT: v_readfirstlane_b32 s45, v16 ; SI-NEXT: v_mov_b32_e32 v16, s22 ; SI-NEXT: v_readfirstlane_b32 s42, v17 ; SI-NEXT: v_mov_b32_e32 v17, s23 ; SI-NEXT: v_readfirstlane_b32 s43, v18 ; SI-NEXT: v_mov_b32_e32 v18, s24 -; SI-NEXT: v_readfirstlane_b32 s24, v19 +; SI-NEXT: v_writelane_b32 v28, s51, 11 +; SI-NEXT: v_readfirstlane_b32 s40, v19 ; SI-NEXT: v_mov_b32_e32 v19, s25 -; SI-NEXT: v_readfirstlane_b32 s25, v15 +; SI-NEXT: v_readfirstlane_b32 s41, v15 ; SI-NEXT: v_mov_b32_e32 v15, s26 -; SI-NEXT: v_readfirstlane_b32 s26, v16 +; SI-NEXT: v_readfirstlane_b32 s24, v16 ; SI-NEXT: v_mov_b32_e32 v16, s27 -; SI-NEXT: v_readfirstlane_b32 s27, v17 +; SI-NEXT: v_readfirstlane_b32 s25, v17 ; SI-NEXT: v_mov_b32_e32 v17, s28 -; SI-NEXT: v_readfirstlane_b32 s28, v18 +; SI-NEXT: v_readfirstlane_b32 s22, v18 ; SI-NEXT: v_mov_b32_e32 v18, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: v_readfirstlane_b32 s29, v19 -; SI-NEXT: v_readfirstlane_b32 s23, v15 -; SI-NEXT: v_readfirstlane_b32 s22, v16 -; SI-NEXT: v_readfirstlane_b32 s21, v17 -; SI-NEXT: v_readfirstlane_b32 s20, v18 -; SI-NEXT: v_readfirstlane_b32 s19, v0 -; SI-NEXT: v_readfirstlane_b32 s18, v1 -; SI-NEXT: v_readfirstlane_b32 s17, v2 -; SI-NEXT: v_readfirstlane_b32 s16, v3 -; SI-NEXT: v_readfirstlane_b32 s15, v4 -; SI-NEXT: v_readfirstlane_b32 s14, v5 -; SI-NEXT: v_readfirstlane_b32 s13, v6 -; SI-NEXT: v_readfirstlane_b32 s12, v7 -; SI-NEXT: v_readfirstlane_b32 s11, v8 -; SI-NEXT: v_readfirstlane_b32 s10, v9 -; SI-NEXT: v_readfirstlane_b32 s8, v10 +; SI-NEXT: v_writelane_b32 v28, s52, 12 +; SI-NEXT: v_readfirstlane_b32 s23, v19 +; SI-NEXT: v_readfirstlane_b32 s20, v15 +; SI-NEXT: v_readfirstlane_b32 s21, v16 +; SI-NEXT: v_readfirstlane_b32 s18, v17 +; SI-NEXT: v_readfirstlane_b32 s19, v18 +; SI-NEXT: v_readfirstlane_b32 s16, v0 +; SI-NEXT: v_readfirstlane_b32 s17, v1 +; SI-NEXT: v_readfirstlane_b32 s14, v2 +; SI-NEXT: v_readfirstlane_b32 s15, v3 +; SI-NEXT: v_readfirstlane_b32 s12, v4 +; SI-NEXT: v_readfirstlane_b32 s13, v5 +; SI-NEXT: v_readfirstlane_b32 s10, v6 +; SI-NEXT: v_readfirstlane_b32 s11, v7 +; SI-NEXT: v_readfirstlane_b32 s8, v8 +; SI-NEXT: v_readfirstlane_b32 s9, v9 +; SI-NEXT: v_readfirstlane_b32 s6, v10 ; SI-NEXT: v_readfirstlane_b32 s7, v11 -; SI-NEXT: v_readfirstlane_b32 s6, v12 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v13 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: v_readfirstlane_b32 s4, v12 +; SI-NEXT: s_and_b64 s[26:27], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v13 +; SI-NEXT: v_writelane_b32 v28, s53, 13 +; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 -; SI-NEXT: s_lshr_b32 s4, s43, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s42, 16 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v45, s4 -; SI-NEXT: s_lshr_b32 s4, s41, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s40, 16 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s40 +; SI-NEXT: s_lshr_b32 s30, s5, 16 +; SI-NEXT: s_lshr_b32 s31, s7, 16 +; SI-NEXT: s_lshr_b32 s34, s9, 16 +; SI-NEXT: s_lshr_b32 s35, s11, 16 +; SI-NEXT: s_lshr_b32 s36, s13, 16 +; SI-NEXT: s_lshr_b32 s37, s15, 16 +; SI-NEXT: s_lshr_b32 s38, s17, 16 +; SI-NEXT: s_lshr_b32 s39, s19, 16 +; SI-NEXT: s_lshr_b32 s48, s21, 16 +; SI-NEXT: s_lshr_b32 s49, s23, 16 +; SI-NEXT: s_lshr_b32 s50, s25, 16 +; SI-NEXT: s_lshr_b32 s51, s41, 16 +; SI-NEXT: s_lshr_b32 s52, s43, 16 +; SI-NEXT: s_lshr_b32 s53, s45, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[44:45], 16 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: s_add_i32 s40, s40, 3 -; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s45, s45, 3 +; SI-NEXT: s_add_i32 s44, s44, 3 ; SI-NEXT: s_add_i32 s43, s43, 3 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 @@ -8218,281 +7856,201 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i ; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_lshr_b32 s4, s40, 16 -; SI-NEXT: s_lshr_b32 s5, s41, 16 -; SI-NEXT: s_lshr_b32 s44, s42, 16 -; SI-NEXT: s_lshr_b32 s45, s43, 16 -; SI-NEXT: s_lshr_b32 s46, s24, 16 -; SI-NEXT: s_lshr_b32 s47, s25, 16 -; SI-NEXT: s_lshr_b32 s56, s26, 16 -; SI-NEXT: s_lshr_b32 s57, s27, 16 -; SI-NEXT: s_lshr_b32 s58, s28, 16 -; SI-NEXT: s_lshr_b32 s59, s29, 16 -; SI-NEXT: s_lshr_b32 s60, s23, 16 -; SI-NEXT: s_lshr_b32 s61, s22, 16 -; SI-NEXT: s_lshr_b32 s62, s21, 16 -; SI-NEXT: s_lshr_b32 s63, s20, 16 -; SI-NEXT: s_lshr_b32 s72, s19, 16 -; SI-NEXT: s_lshr_b32 s73, s18, 16 -; SI-NEXT: s_lshr_b32 s74, s17, 16 -; SI-NEXT: s_lshr_b32 s75, s16, 16 -; SI-NEXT: s_lshr_b32 s76, s15, 16 -; SI-NEXT: s_lshr_b32 s77, s14, 16 -; SI-NEXT: s_lshr_b32 s78, s13, 16 -; SI-NEXT: s_lshr_b32 s79, s12, 16 -; SI-NEXT: s_lshr_b32 s88, s11, 16 -; SI-NEXT: s_lshr_b32 s89, s10, 16 -; SI-NEXT: s_lshr_b32 s90, s8, 16 -; SI-NEXT: s_lshr_b32 s91, s7, 16 -; SI-NEXT: s_lshr_b32 s92, s6, 16 -; SI-NEXT: s_lshr_b32 s93, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s25 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v43, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s43 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v44, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s41 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v46, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s93 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s92 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s91 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s90 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s89 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s88 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s79 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s78 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s77 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s76 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s75 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s74 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s73 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s72 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s63 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s62 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 +; SI-NEXT: s_add_i32 s5, s5, 3 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[20:21], 16 +; SI-NEXT: s_lshr_b32 s30, s5, 16 +; SI-NEXT: s_lshr_b32 s31, s7, 16 +; SI-NEXT: s_lshr_b32 s34, s9, 16 +; SI-NEXT: s_lshr_b32 s35, s11, 16 +; SI-NEXT: s_lshr_b32 s36, s13, 16 +; SI-NEXT: s_lshr_b32 s37, s15, 16 +; SI-NEXT: s_lshr_b32 s38, s17, 16 +; SI-NEXT: s_lshr_b32 s39, s19, 16 +; SI-NEXT: s_lshr_b32 s48, s21, 16 +; SI-NEXT: s_lshr_b32 s49, s23, 16 +; SI-NEXT: s_lshr_b32 s50, s25, 16 +; SI-NEXT: s_lshr_b32 s51, s41, 16 +; SI-NEXT: s_lshr_b32 s52, s43, 16 +; SI-NEXT: s_lshr_b32 s53, s45, 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[44:45], 16 ; SI-NEXT: .LBB17_3: ; %end -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 -; SI-NEXT: v_or_b32_e32 v2, v44, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v44 -; SI-NEXT: v_or_b32_e32 v5, v5, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: v_or_b32_e32 v7, v7, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v8 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 -; SI-NEXT: v_or_b32_e32 v3, v46, v3 -; SI-NEXT: v_or_b32_e32 v4, v43, v4 -; SI-NEXT: v_or_b32_e32 v6, v41, v6 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v40 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v9, v54, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v54 -; SI-NEXT: v_or_b32_e32 v11, v52, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 -; SI-NEXT: v_or_b32_e32 v13, v50, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v50 -; SI-NEXT: v_or_b32_e32 v15, v48, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v48 -; SI-NEXT: v_or_b32_e32 v17, v38, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v38 -; SI-NEXT: v_or_b32_e32 v19, v36, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v36 -; SI-NEXT: v_or_b32_e32 v21, v34, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v34 -; SI-NEXT: v_or_b32_e32 v23, v32, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v32 -; SI-NEXT: v_or_b32_e32 v25, v30, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v30 -; SI-NEXT: v_or_b32_e32 v8, v55, v8 -; SI-NEXT: v_or_b32_e32 v10, v53, v10 -; SI-NEXT: v_or_b32_e32 v12, v51, v12 -; SI-NEXT: v_or_b32_e32 v14, v49, v14 -; SI-NEXT: v_or_b32_e32 v16, v39, v16 -; SI-NEXT: v_or_b32_e32 v18, v37, v18 -; SI-NEXT: v_or_b32_e32 v20, v35, v20 -; SI-NEXT: v_or_b32_e32 v22, v33, v22 -; SI-NEXT: v_or_b32_e32 v24, v31, v24 -; SI-NEXT: v_or_b32_e32 v26, v29, v26 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: s_lshl_b32 s27, s92, 16 +; SI-NEXT: s_and_b32 s29, s44, 0xffff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s29, s45, 0xffff +; SI-NEXT: s_lshl_b32 s44, s53, 16 +; SI-NEXT: s_or_b32 s29, s29, s44 +; SI-NEXT: s_lshl_b32 s44, s90, 16 +; SI-NEXT: s_and_b32 s42, s42, 0xffff +; SI-NEXT: s_or_b32 s42, s42, s44 +; SI-NEXT: s_and_b32 s43, s43, 0xffff +; SI-NEXT: s_lshl_b32 s44, s52, 16 +; SI-NEXT: s_or_b32 s43, s43, s44 +; SI-NEXT: s_lshl_b32 s44, s88, 16 +; SI-NEXT: s_and_b32 s40, s40, 0xffff +; SI-NEXT: s_or_b32 s40, s40, s44 +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: s_lshl_b32 s44, s51, 16 +; SI-NEXT: s_or_b32 s41, s41, s44 +; SI-NEXT: s_lshl_b32 s44, s78, 16 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_or_b32 s24, s24, s44 +; SI-NEXT: s_and_b32 s25, s25, 0xffff +; SI-NEXT: s_lshl_b32 s44, s50, 16 +; SI-NEXT: s_or_b32 s25, s25, s44 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_lshl_b32 s44, s76, 16 +; SI-NEXT: s_or_b32 s22, s22, s44 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s44, s49, 16 +; SI-NEXT: s_or_b32 s23, s23, s44 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_lshl_b32 s44, s74, 16 +; SI-NEXT: s_or_b32 s20, s20, s44 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s44, s48, 16 +; SI-NEXT: s_or_b32 s21, s21, s44 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s44, s72, 16 +; SI-NEXT: s_or_b32 s18, s18, s44 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s44, s39, 16 +; SI-NEXT: s_or_b32 s19, s19, s44 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s44, s62, 16 +; SI-NEXT: s_or_b32 s16, s16, s44 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s38, 16 +; SI-NEXT: s_or_b32 s17, s17, s44 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s44, s60, 16 +; SI-NEXT: s_or_b32 s14, s14, s44 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s44, s37, 16 +; SI-NEXT: s_or_b32 s15, s15, s44 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s44, s58, 16 +; SI-NEXT: s_or_b32 s12, s12, s44 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s44, s36, 16 +; SI-NEXT: s_or_b32 s13, s13, s44 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s44, s56, 16 +; SI-NEXT: s_or_b32 s10, s10, s44 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s44, s35, 16 +; SI-NEXT: s_or_b32 s11, s11, s44 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s44, s46, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s28, s28, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s44 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s44, s34, 16 +; SI-NEXT: s_or_b32 s6, s6, s28 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s28, s31, 16 +; SI-NEXT: s_or_b32 s4, s4, s26 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s26, s30, 16 +; SI-NEXT: s_or_b32 s9, s9, s44 +; SI-NEXT: s_or_b32 s7, s7, s28 +; SI-NEXT: s_or_b32 s5, s5, s26 +; SI-NEXT: v_mov_b32_e32 v0, s27 +; SI-NEXT: v_mov_b32_e32 v1, s29 +; SI-NEXT: v_mov_b32_e32 v2, s42 +; SI-NEXT: v_mov_b32_e32 v3, s43 +; SI-NEXT: v_mov_b32_e32 v4, s40 +; SI-NEXT: v_mov_b32_e32 v5, s41 +; SI-NEXT: v_mov_b32_e32 v6, s24 +; SI-NEXT: v_mov_b32_e32 v7, s25 +; SI-NEXT: v_mov_b32_e32 v8, s22 +; SI-NEXT: v_mov_b32_e32 v9, s23 +; SI-NEXT: v_mov_b32_e32 v10, s20 +; SI-NEXT: v_mov_b32_e32 v11, s21 +; SI-NEXT: v_mov_b32_e32 v12, s18 +; SI-NEXT: v_mov_b32_e32 v13, s19 +; SI-NEXT: v_mov_b32_e32 v14, s16 +; SI-NEXT: v_mov_b32_e32 v15, s17 +; SI-NEXT: v_mov_b32_e32 v16, s14 +; SI-NEXT: v_mov_b32_e32 v17, s15 +; SI-NEXT: v_mov_b32_e32 v18, s12 +; SI-NEXT: v_mov_b32_e32 v19, s13 +; SI-NEXT: v_mov_b32_e32 v20, s10 +; SI-NEXT: v_mov_b32_e32 v21, s11 +; SI-NEXT: v_mov_b32_e32 v22, s8 +; SI-NEXT: v_mov_b32_e32 v23, s9 +; SI-NEXT: v_mov_b32_e32 v24, s6 +; SI-NEXT: v_mov_b32_e32 v25, s7 +; SI-NEXT: v_mov_b32_e32 v26, s4 +; SI-NEXT: v_mov_b32_e32 v27, s5 +; SI-NEXT: v_readlane_b32 s53, v28, 13 +; SI-NEXT: v_readlane_b32 s52, v28, 12 +; SI-NEXT: v_readlane_b32 s51, v28, 11 +; SI-NEXT: v_readlane_b32 s50, v28, 10 +; SI-NEXT: v_readlane_b32 s49, v28, 9 +; SI-NEXT: v_readlane_b32 s48, v28, 8 +; SI-NEXT: v_readlane_b32 s39, v28, 7 +; SI-NEXT: v_readlane_b32 s38, v28, 6 +; SI-NEXT: v_readlane_b32 s37, v28, 5 +; SI-NEXT: v_readlane_b32 s36, v28, 4 +; SI-NEXT: v_readlane_b32 s35, v28, 3 +; SI-NEXT: v_readlane_b32 s34, v28, 2 +; SI-NEXT: v_readlane_b32 s31, v28, 1 +; SI-NEXT: v_readlane_b32 s30, v28, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr30 ; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v28i32_to_v56f16_scalar: @@ -9258,205 +8816,264 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v59, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_mov_b32_e32 v48, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_mov_b32_e32 v49, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_mov_b32_e32 v50, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_mov_b32_e32 v51, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_mov_b32_e32 v52, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_mov_b32_e32 v53, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; SI-NEXT: v_mov_b32_e32 v54, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_mov_b32_e32 v55, v12 +; SI-NEXT: v_mov_b32_e32 v40, v11 +; SI-NEXT: v_mov_b32_e32 v41, v10 +; SI-NEXT: v_mov_b32_e32 v42, v9 +; SI-NEXT: v_mov_b32_e32 v43, v8 +; SI-NEXT: v_mov_b32_e32 v44, v7 +; SI-NEXT: v_mov_b32_e32 v45, v6 +; SI-NEXT: v_mov_b32_e32 v46, v5 +; SI-NEXT: v_mov_b32_e32 v47, v4 +; SI-NEXT: v_mov_b32_e32 v56, v3 +; SI-NEXT: v_mov_b32_e32 v57, v2 +; SI-NEXT: v_mov_b32_e32 v58, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v27 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v59 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v62 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v60 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v63 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v33 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v55 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v53 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v51 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v49 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 @@ -9500,145 +9117,20 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v33 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 -; SI-NEXT: v_or_b32_e32 v1, v44, v1 -; SI-NEXT: v_or_b32_e32 v2, v42, v2 -; SI-NEXT: v_or_b32_e32 v3, v40, v3 -; SI-NEXT: v_or_b32_e32 v4, v54, v4 -; SI-NEXT: v_or_b32_e32 v5, v52, v5 -; SI-NEXT: v_or_b32_e32 v6, v50, v6 -; SI-NEXT: v_or_b32_e32 v21, v60, v21 -; SI-NEXT: v_or_b32_e32 v22, v58, v22 -; SI-NEXT: v_or_b32_e32 v23, v48, v23 -; SI-NEXT: v_or_b32_e32 v24, v38, v24 -; SI-NEXT: v_or_b32_e32 v25, v36, v25 -; SI-NEXT: v_or_b32_e32 v26, v34, v26 -; SI-NEXT: v_or_b32_e32 v27, v32, v27 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v63 -; SI-NEXT: v_or_b32_e32 v20, v62, v20 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: .LBB18_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -9651,10 +9143,10 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v47 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -9663,167 +9155,152 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v46 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v45 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v60 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v39 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v38 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v36 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v49 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v33 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v43 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v40 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v55 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v54 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v53 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v52 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v51 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v48 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v63 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 @@ -9831,78 +9308,92 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v50 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v63 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v62 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v59 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v57 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v37 -; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v35 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v34 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 ; SI-NEXT: v_or_b32_e32 v27, v29, v27 @@ -10691,664 +10182,412 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-LABEL: bitcast_v56f16_to_v28i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: s_lshr_b32 s41, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s17 -; SI-NEXT: s_lshr_b32 s15, s18, 16 -; SI-NEXT: s_lshr_b32 s40, s17, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s19 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s21 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s12 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s10 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s8 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v31 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v29, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_mov_b32_e32 v32, v13 +; SI-NEXT: v_mov_b32_e32 v33, v12 +; SI-NEXT: v_mov_b32_e32 v34, v11 +; SI-NEXT: v_mov_b32_e32 v35, v10 +; SI-NEXT: v_mov_b32_e32 v36, v9 +; SI-NEXT: v_mov_b32_e32 v37, v8 +; SI-NEXT: v_mov_b32_e32 v38, v7 +; SI-NEXT: v_mov_b32_e32 v39, v6 +; SI-NEXT: v_mov_b32_e32 v48, v5 +; SI-NEXT: v_mov_b32_e32 v49, v4 +; SI-NEXT: v_mov_b32_e32 v50, v3 +; SI-NEXT: v_mov_b32_e32 v51, v2 +; SI-NEXT: v_mov_b32_e32 v52, v1 +; SI-NEXT: v_mov_b32_e32 v53, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v37 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v38 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v39 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v48 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v49 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v50 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v51 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v52 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v53 ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v58 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v30 -; SI-NEXT: v_mov_b32_e32 v49, v39 -; SI-NEXT: v_or_b32_e32 v0, v39, v0 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_or_b32_e32 v1, v38, v1 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_or_b32_e32 v2, v53, v2 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v32, v48 -; SI-NEXT: v_or_b32_e32 v3, v48, v3 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v38, v37 -; SI-NEXT: v_or_b32_e32 v4, v37, v4 -; SI-NEXT: v_mov_b32_e32 v37, v36 -; SI-NEXT: v_mov_b32_e32 v48, v63 -; SI-NEXT: v_or_b32_e32 v5, v63, v5 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v63, v62 -; SI-NEXT: v_or_b32_e32 v6, v62, v6 -; SI-NEXT: v_or_b32_e32 v7, v59, v7 -; SI-NEXT: v_mov_b32_e32 v61, v60 -; SI-NEXT: v_mov_b32_e32 v59, v47 -; SI-NEXT: v_or_b32_e32 v8, v47, v8 -; SI-NEXT: v_mov_b32_e32 v56, v58 -; SI-NEXT: v_or_b32_e32 v11, v46, v11 -; SI-NEXT: v_or_b32_e32 v12, v45, v12 -; SI-NEXT: v_or_b32_e32 v13, v43, v13 -; SI-NEXT: v_or_b32_e32 v14, v41, v14 -; SI-NEXT: v_or_b32_e32 v15, v31, v15 -; SI-NEXT: v_or_b32_e32 v16, v28, v16 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v57 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v10, v33, v10 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v27, v57, v27 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v56 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v53 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v58 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v52 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v51 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v47 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v48 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v44 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v38 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v37 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v41 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v35 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v34 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v59 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v54 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v57 ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v17, v56 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v50 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v46 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v45 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v39 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v43 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v42 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v36 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 ; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v55 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v33 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 ; SI-NEXT: v_or_b32_e32 v27, v29, v27 ; SI-NEXT: .LBB19_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB19_4: -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v57, v43 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v46, v42 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v45, v41 -; SI-NEXT: v_mov_b32_e32 v43, v31 -; SI-NEXT: v_mov_b32_e32 v42, v30 -; SI-NEXT: v_mov_b32_e32 v41, v28 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v28, v41 -; SI-NEXT: v_mov_b32_e32 v30, v42 -; SI-NEXT: v_mov_b32_e32 v41, v45 -; SI-NEXT: v_mov_b32_e32 v42, v46 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v49, v39 -; SI-NEXT: v_mov_b32_e32 v32, v48 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_mov_b32_e32 v38, v37 -; SI-NEXT: v_mov_b32_e32 v37, v36 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v48, v63 -; SI-NEXT: v_mov_b32_e32 v63, v62 -; SI-NEXT: v_mov_b32_e32 v61, v60 -; SI-NEXT: v_mov_b32_e32 v59, v47 -; SI-NEXT: v_mov_b32_e32 v56, v58 -; SI-NEXT: v_mov_b32_e32 v31, v43 -; SI-NEXT: v_mov_b32_e32 v43, v57 ; SI-NEXT: s_branch .LBB19_2 ; ; VI-LABEL: bitcast_v56f16_to_v28i32_scalar: @@ -17987,555 +17226,245 @@ end: define <56 x half> @bitcast_v28f32_to_v56f16(<28 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v28f32_to_v56f16: ; SI: ; %bb.0: -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v25 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v63, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v30 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v30 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v28, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v29, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v30, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v31, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v32, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v33, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v34, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v35, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v36, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v38, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v48, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v51, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v53, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v40, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v30 -; SI-NEXT: v_mov_b32_e32 v30, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 ; SI-NEXT: .LBB32_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 ; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 ; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 ; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 ; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_alignbit_b32 v28, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v29, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v30, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v31, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v32, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v33, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v34, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v35, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v36, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v38, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v48, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v51, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v53, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v40, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_mov_b32_e32 v46, v24 -; SI-NEXT: v_mov_b32_e32 v45, v25 -; SI-NEXT: v_mov_b32_e32 v43, v26 -; SI-NEXT: v_mov_b32_e32 v41, v27 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 ; SI-NEXT: .LBB32_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_or_b32_e32 v0, v0, v40 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v47 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v40 +; SI-NEXT: v_or_b32_e32 v2, v2, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v46 +; SI-NEXT: v_or_b32_e32 v4, v4, v51 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v45 +; SI-NEXT: v_or_b32_e32 v6, v6, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v44 +; SI-NEXT: v_or_b32_e32 v8, v8, v38 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v43 +; SI-NEXT: v_or_b32_e32 v10, v10, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v42 +; SI-NEXT: v_or_b32_e32 v12, v12, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v41 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v48 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v34 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v33 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v47 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v42 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v31 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v10, v61 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v57 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v44 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v58 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v62 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v46 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v43 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v30 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v14, v14, v34 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v55 +; SI-NEXT: v_or_b32_e32 v16, v16, v33 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v54 +; SI-NEXT: v_or_b32_e32 v18, v18, v32 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v52 +; SI-NEXT: v_or_b32_e32 v20, v20, v31 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v50 +; SI-NEXT: v_or_b32_e32 v22, v22, v30 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v49 +; SI-NEXT: v_or_b32_e32 v24, v24, v29 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v39 +; SI-NEXT: v_or_b32_e32 v26, v26, v28 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v37 +; SI-NEXT: v_or_b32_e32 v3, v3, v53 +; SI-NEXT: v_or_b32_e32 v5, v5, v51 +; SI-NEXT: v_or_b32_e32 v7, v7, v48 +; SI-NEXT: v_or_b32_e32 v9, v9, v38 +; SI-NEXT: v_or_b32_e32 v11, v11, v36 +; SI-NEXT: v_or_b32_e32 v13, v13, v35 +; SI-NEXT: v_or_b32_e32 v15, v15, v34 +; SI-NEXT: v_or_b32_e32 v17, v17, v33 +; SI-NEXT: v_or_b32_e32 v19, v19, v32 +; SI-NEXT: v_or_b32_e32 v21, v21, v31 +; SI-NEXT: v_or_b32_e32 v23, v23, v30 +; SI-NEXT: v_or_b32_e32 v25, v25, v29 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -19107,21 +18036,21 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: v_mov_b32_e32 v21, s16 -; SI-NEXT: v_mov_b32_e32 v20, s17 -; SI-NEXT: v_mov_b32_e32 v29, s18 -; SI-NEXT: v_mov_b32_e32 v22, s19 -; SI-NEXT: v_mov_b32_e32 v28, s20 -; SI-NEXT: v_mov_b32_e32 v15, s21 +; SI-NEXT: v_mov_b32_e32 v24, s16 +; SI-NEXT: v_mov_b32_e32 v25, s17 +; SI-NEXT: v_mov_b32_e32 v20, s18 +; SI-NEXT: v_mov_b32_e32 v21, s19 +; SI-NEXT: v_mov_b32_e32 v26, s20 +; SI-NEXT: v_mov_b32_e32 v27, s21 +; SI-NEXT: v_mov_b32_e32 v22, s22 +; SI-NEXT: v_mov_b32_e32 v23, s23 +; SI-NEXT: v_mov_b32_e32 v18, s24 +; SI-NEXT: v_mov_b32_e32 v19, s25 +; SI-NEXT: v_mov_b32_e32 v16, s26 +; SI-NEXT: v_mov_b32_e32 v17, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v33, s22 -; SI-NEXT: v_mov_b32_e32 v32, s23 -; SI-NEXT: v_mov_b32_e32 v14, s24 -; SI-NEXT: v_mov_b32_e32 v16, s25 -; SI-NEXT: v_mov_b32_e32 v19, s26 -; SI-NEXT: v_mov_b32_e32 v18, s27 -; SI-NEXT: v_mov_b32_e32 v31, s28 -; SI-NEXT: v_mov_b32_e32 v23, s29 +; SI-NEXT: v_mov_b32_e32 v14, s28 +; SI-NEXT: v_mov_b32_e32 v15, s29 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -19140,338 +18069,185 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB33_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v18 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v14 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v15 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v29 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v20 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v11 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v10 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v9 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v8 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v7 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v5 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v4 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v3 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v2 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v1 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[50:51], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[51:52], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[52:53], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[53:54], v[6:7], 16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; SI-NEXT: v_lshr_b64 v[54:55], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[2:3], 16 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v15 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v19 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v27 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v21 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v25 +; SI-NEXT: v_mov_b32_e32 v55, v28 +; SI-NEXT: v_lshr_b64 v[41:42], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[26:27], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[24:25], 16 ; SI-NEXT: s_cbranch_execnz .LBB33_3 ; SI-NEXT: .LBB33_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v4 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_lshr_b64 v[50:51], v[12:13], 16 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_lshr_b64 v[51:52], v[10:11], 16 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v3 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_lshr_b64 v[52:53], v[8:9], 16 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 ; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; SI-NEXT: v_add_f32_e32 v25, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 ; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v28 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 -; SI-NEXT: v_add_f32_e32 v27, 1.0, v32 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 ; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v30, 1.0, v31 -; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v12 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshr_b64 v[53:54], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[54:55], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[41:42], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[26:27], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[24:25], 16 ; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v15 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v19 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v27 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v21 -; SI-NEXT: v_mov_b32_e32 v36, v12 -; SI-NEXT: v_mov_b32_e32 v34, v13 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v25 ; SI-NEXT: .LBB33_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v28, v24, v28 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v43 +; SI-NEXT: v_or_b32_e32 v29, v24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v30 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v30, v20, v24 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v63 +; SI-NEXT: v_or_b32_e32 v31, v20, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v32 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v32, v21, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v62 +; SI-NEXT: v_or_b32_e32 v33, v20, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v34 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v34, v21, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v61 +; SI-NEXT: v_or_b32_e32 v35, v20, v21 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v36 +; SI-NEXT: v_or_b32_e32 v36, v18, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v60 +; SI-NEXT: v_or_b32_e32 v37, v18, v19 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v38 +; SI-NEXT: v_or_b32_e32 v38, v16, v18 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v59 +; SI-NEXT: v_or_b32_e32 v39, v16, v17 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v48 +; SI-NEXT: v_or_b32_e32 v48, v14, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v58 +; SI-NEXT: v_or_b32_e32 v49, v14, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 +; SI-NEXT: v_or_b32_e32 v14, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v56 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v30 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v24 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v46 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v26 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v42 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v60 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v53 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v57 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v49 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v45 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v37 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v41 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v24, v48 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -19488,167 +18264,59 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v34 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v38 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v24, v50 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v28 +; SI-NEXT: v_mov_b32_e32 v1, v29 +; SI-NEXT: v_mov_b32_e32 v2, v30 +; SI-NEXT: v_mov_b32_e32 v3, v31 +; SI-NEXT: v_mov_b32_e32 v4, v32 +; SI-NEXT: v_mov_b32_e32 v5, v33 +; SI-NEXT: v_mov_b32_e32 v6, v34 +; SI-NEXT: v_mov_b32_e32 v7, v35 +; SI-NEXT: v_mov_b32_e32 v8, v36 +; SI-NEXT: v_mov_b32_e32 v9, v37 +; SI-NEXT: v_mov_b32_e32 v10, v38 +; SI-NEXT: v_mov_b32_e32 v11, v39 +; SI-NEXT: v_mov_b32_e32 v12, v48 +; SI-NEXT: v_mov_b32_e32 v13, v49 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v54 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v36 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: s_branch .LBB33_2 ; ; VI-LABEL: bitcast_v28f32_to_v56f16_scalar: @@ -20500,205 +19168,264 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v59, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_mov_b32_e32 v48, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_mov_b32_e32 v49, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_mov_b32_e32 v50, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_mov_b32_e32 v51, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_mov_b32_e32 v52, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_mov_b32_e32 v53, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; SI-NEXT: v_mov_b32_e32 v54, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_mov_b32_e32 v55, v12 +; SI-NEXT: v_mov_b32_e32 v40, v11 +; SI-NEXT: v_mov_b32_e32 v41, v10 +; SI-NEXT: v_mov_b32_e32 v42, v9 +; SI-NEXT: v_mov_b32_e32 v43, v8 +; SI-NEXT: v_mov_b32_e32 v44, v7 +; SI-NEXT: v_mov_b32_e32 v45, v6 +; SI-NEXT: v_mov_b32_e32 v46, v5 +; SI-NEXT: v_mov_b32_e32 v47, v4 +; SI-NEXT: v_mov_b32_e32 v56, v3 +; SI-NEXT: v_mov_b32_e32 v57, v2 +; SI-NEXT: v_mov_b32_e32 v58, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v27 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v59 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v62 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v60 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v63 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v33 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v55 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v53 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v51 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v49 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 @@ -20742,145 +19469,20 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v33 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 -; SI-NEXT: v_or_b32_e32 v1, v44, v1 -; SI-NEXT: v_or_b32_e32 v2, v42, v2 -; SI-NEXT: v_or_b32_e32 v3, v40, v3 -; SI-NEXT: v_or_b32_e32 v4, v54, v4 -; SI-NEXT: v_or_b32_e32 v5, v52, v5 -; SI-NEXT: v_or_b32_e32 v6, v50, v6 -; SI-NEXT: v_or_b32_e32 v21, v60, v21 -; SI-NEXT: v_or_b32_e32 v22, v58, v22 -; SI-NEXT: v_or_b32_e32 v23, v48, v23 -; SI-NEXT: v_or_b32_e32 v24, v38, v24 -; SI-NEXT: v_or_b32_e32 v25, v36, v25 -; SI-NEXT: v_or_b32_e32 v26, v34, v26 -; SI-NEXT: v_or_b32_e32 v27, v32, v27 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v63 -; SI-NEXT: v_or_b32_e32 v20, v62, v20 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: .LBB34_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -20893,10 +19495,10 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v47 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -20905,167 +19507,152 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v46 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v45 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v60 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v39 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v38 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v36 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v49 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v33 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v43 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v40 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v55 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v54 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v53 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v52 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v51 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v48 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v63 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 @@ -21073,78 +19660,92 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v50 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v63 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v62 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v59 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v57 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v37 -; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v35 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v34 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 ; SI-NEXT: v_or_b32_e32 v27, v29, v27 @@ -21933,664 +20534,412 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-LABEL: bitcast_v56f16_to_v28f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: s_lshr_b32 s41, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s17 -; SI-NEXT: s_lshr_b32 s15, s18, 16 -; SI-NEXT: s_lshr_b32 s40, s17, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s19 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s21 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s12 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s10 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s8 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v31 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v29, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_mov_b32_e32 v32, v13 +; SI-NEXT: v_mov_b32_e32 v33, v12 +; SI-NEXT: v_mov_b32_e32 v34, v11 +; SI-NEXT: v_mov_b32_e32 v35, v10 +; SI-NEXT: v_mov_b32_e32 v36, v9 +; SI-NEXT: v_mov_b32_e32 v37, v8 +; SI-NEXT: v_mov_b32_e32 v38, v7 +; SI-NEXT: v_mov_b32_e32 v39, v6 +; SI-NEXT: v_mov_b32_e32 v48, v5 +; SI-NEXT: v_mov_b32_e32 v49, v4 +; SI-NEXT: v_mov_b32_e32 v50, v3 +; SI-NEXT: v_mov_b32_e32 v51, v2 +; SI-NEXT: v_mov_b32_e32 v52, v1 +; SI-NEXT: v_mov_b32_e32 v53, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v37 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v38 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v39 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v48 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v49 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v50 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v51 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v52 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v53 ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v58 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v30 -; SI-NEXT: v_mov_b32_e32 v49, v39 -; SI-NEXT: v_or_b32_e32 v0, v39, v0 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_or_b32_e32 v1, v38, v1 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_or_b32_e32 v2, v53, v2 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v32, v48 -; SI-NEXT: v_or_b32_e32 v3, v48, v3 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v38, v37 -; SI-NEXT: v_or_b32_e32 v4, v37, v4 -; SI-NEXT: v_mov_b32_e32 v37, v36 -; SI-NEXT: v_mov_b32_e32 v48, v63 -; SI-NEXT: v_or_b32_e32 v5, v63, v5 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v63, v62 -; SI-NEXT: v_or_b32_e32 v6, v62, v6 -; SI-NEXT: v_or_b32_e32 v7, v59, v7 -; SI-NEXT: v_mov_b32_e32 v61, v60 -; SI-NEXT: v_mov_b32_e32 v59, v47 -; SI-NEXT: v_or_b32_e32 v8, v47, v8 -; SI-NEXT: v_mov_b32_e32 v56, v58 -; SI-NEXT: v_or_b32_e32 v11, v46, v11 -; SI-NEXT: v_or_b32_e32 v12, v45, v12 -; SI-NEXT: v_or_b32_e32 v13, v43, v13 -; SI-NEXT: v_or_b32_e32 v14, v41, v14 -; SI-NEXT: v_or_b32_e32 v15, v31, v15 -; SI-NEXT: v_or_b32_e32 v16, v28, v16 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v57 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v10, v33, v10 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v27, v57, v27 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v56 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v53 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v58 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v52 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v51 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v47 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v48 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v44 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v38 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v37 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v41 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v35 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v34 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v59 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v54 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v57 ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v17, v56 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v50 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v46 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v45 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v39 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v43 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v42 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v36 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 ; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v55 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v33 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 ; SI-NEXT: v_or_b32_e32 v27, v29, v27 ; SI-NEXT: .LBB35_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB35_4: -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v57, v43 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v46, v42 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v45, v41 -; SI-NEXT: v_mov_b32_e32 v43, v31 -; SI-NEXT: v_mov_b32_e32 v42, v30 -; SI-NEXT: v_mov_b32_e32 v41, v28 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v28, v41 -; SI-NEXT: v_mov_b32_e32 v30, v42 -; SI-NEXT: v_mov_b32_e32 v41, v45 -; SI-NEXT: v_mov_b32_e32 v42, v46 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v49, v39 -; SI-NEXT: v_mov_b32_e32 v32, v48 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_mov_b32_e32 v38, v37 -; SI-NEXT: v_mov_b32_e32 v37, v36 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v48, v63 -; SI-NEXT: v_mov_b32_e32 v63, v62 -; SI-NEXT: v_mov_b32_e32 v61, v60 -; SI-NEXT: v_mov_b32_e32 v59, v47 -; SI-NEXT: v_mov_b32_e32 v56, v58 -; SI-NEXT: v_mov_b32_e32 v31, v43 -; SI-NEXT: v_mov_b32_e32 v43, v57 ; SI-NEXT: s_branch .LBB35_2 ; ; VI-LABEL: bitcast_v56f16_to_v28f32_scalar: @@ -28326,242 +26675,83 @@ end: define <56 x half> @bitcast_v14i64_to_v56f16(<14 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v14i64_to_v56f16: ; SI: ; %bb.0: -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v25 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v63, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v30 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v30 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v28, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v29, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v30, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v31, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v32, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v33, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v34, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v35, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v36, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v37, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v39, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v50, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v53, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v40, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v30 -; SI-NEXT: v_mov_b32_e32 v30, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 ; SI-NEXT: .LBB44_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_4 @@ -28574,7 +26764,6 @@ define <56 x half> @bitcast_v14i64_to_v56f16(<14 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 @@ -28588,294 +26777,143 @@ define <56 x half> @bitcast_v14i64_to_v56f16(<14 x i64> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 ; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 ; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 ; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 ; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v28, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v29, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v30, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v31, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v32, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v33, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v34, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v35, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v36, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v37, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v39, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v50, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v53, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v40, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_mov_b32_e32 v46, v24 -; SI-NEXT: v_mov_b32_e32 v45, v25 -; SI-NEXT: v_mov_b32_e32 v43, v26 -; SI-NEXT: v_mov_b32_e32 v41, v27 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 ; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_or_b32_e32 v0, v0, v40 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v47 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v40 +; SI-NEXT: v_or_b32_e32 v2, v2, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v46 +; SI-NEXT: v_or_b32_e32 v4, v4, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v45 +; SI-NEXT: v_or_b32_e32 v6, v6, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v44 +; SI-NEXT: v_or_b32_e32 v8, v8, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v43 +; SI-NEXT: v_or_b32_e32 v10, v10, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v42 +; SI-NEXT: v_or_b32_e32 v12, v12, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v41 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v48 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v34 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v33 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v47 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v42 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v31 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v10, v61 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v57 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v44 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v58 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v62 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v46 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v43 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v30 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v14, v14, v34 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v55 +; SI-NEXT: v_or_b32_e32 v16, v16, v33 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v54 +; SI-NEXT: v_or_b32_e32 v18, v18, v32 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v52 +; SI-NEXT: v_or_b32_e32 v20, v20, v31 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v51 +; SI-NEXT: v_or_b32_e32 v22, v22, v30 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v49 +; SI-NEXT: v_or_b32_e32 v24, v24, v29 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_or_b32_e32 v26, v26, v28 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v38 +; SI-NEXT: v_or_b32_e32 v3, v3, v53 +; SI-NEXT: v_or_b32_e32 v5, v5, v50 +; SI-NEXT: v_or_b32_e32 v7, v7, v39 +; SI-NEXT: v_or_b32_e32 v9, v9, v37 +; SI-NEXT: v_or_b32_e32 v11, v11, v36 +; SI-NEXT: v_or_b32_e32 v13, v13, v35 +; SI-NEXT: v_or_b32_e32 v15, v15, v34 +; SI-NEXT: v_or_b32_e32 v17, v17, v33 +; SI-NEXT: v_or_b32_e32 v19, v19, v32 +; SI-NEXT: v_or_b32_e32 v21, v21, v31 +; SI-NEXT: v_or_b32_e32 v23, v23, v30 +; SI-NEXT: v_or_b32_e32 v25, v25, v29 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -29488,449 +27526,317 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i ; SI-LABEL: bitcast_v14i64_to_v56f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v28, s30, 0 +; SI-NEXT: v_writelane_b32 v28, s31, 1 +; SI-NEXT: v_writelane_b32 v28, s34, 2 +; SI-NEXT: v_writelane_b32 v28, s35, 3 +; SI-NEXT: v_writelane_b32 v28, s36, 4 +; SI-NEXT: v_writelane_b32 v28, s37, 5 +; SI-NEXT: v_writelane_b32 v28, s38, 6 +; SI-NEXT: v_writelane_b32 v28, s39, 7 +; SI-NEXT: v_writelane_b32 v28, s48, 8 +; SI-NEXT: v_writelane_b32 v28, s49, 9 ; SI-NEXT: v_mov_b32_e32 v15, s16 ; SI-NEXT: v_mov_b32_e32 v16, s17 ; SI-NEXT: v_mov_b32_e32 v17, s18 ; SI-NEXT: v_mov_b32_e32 v18, s19 +; SI-NEXT: v_writelane_b32 v28, s50, 10 ; SI-NEXT: v_mov_b32_e32 v19, s20 -; SI-NEXT: v_readfirstlane_b32 s40, v15 +; SI-NEXT: v_readfirstlane_b32 s44, v15 ; SI-NEXT: v_mov_b32_e32 v15, s21 -; SI-NEXT: v_readfirstlane_b32 s42, v16 +; SI-NEXT: v_readfirstlane_b32 s45, v16 ; SI-NEXT: v_mov_b32_e32 v16, s22 -; SI-NEXT: v_readfirstlane_b32 s41, v17 +; SI-NEXT: v_readfirstlane_b32 s42, v17 ; SI-NEXT: v_mov_b32_e32 v17, s23 ; SI-NEXT: v_readfirstlane_b32 s43, v18 ; SI-NEXT: v_mov_b32_e32 v18, s24 -; SI-NEXT: v_readfirstlane_b32 s24, v19 +; SI-NEXT: v_writelane_b32 v28, s51, 11 +; SI-NEXT: v_readfirstlane_b32 s40, v19 ; SI-NEXT: v_mov_b32_e32 v19, s25 -; SI-NEXT: v_readfirstlane_b32 s44, v15 +; SI-NEXT: v_readfirstlane_b32 s41, v15 ; SI-NEXT: v_mov_b32_e32 v15, s26 -; SI-NEXT: v_readfirstlane_b32 s25, v16 +; SI-NEXT: v_readfirstlane_b32 s24, v16 ; SI-NEXT: v_mov_b32_e32 v16, s27 -; SI-NEXT: v_readfirstlane_b32 s27, v17 +; SI-NEXT: v_readfirstlane_b32 s25, v17 ; SI-NEXT: v_mov_b32_e32 v17, s28 -; SI-NEXT: v_readfirstlane_b32 s26, v18 +; SI-NEXT: v_readfirstlane_b32 s22, v18 ; SI-NEXT: v_mov_b32_e32 v18, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: v_readfirstlane_b32 s28, v19 -; SI-NEXT: v_readfirstlane_b32 s22, v15 -; SI-NEXT: v_readfirstlane_b32 s23, v16 -; SI-NEXT: v_readfirstlane_b32 s20, v17 -; SI-NEXT: v_readfirstlane_b32 s21, v18 -; SI-NEXT: v_readfirstlane_b32 s18, v0 -; SI-NEXT: v_readfirstlane_b32 s19, v1 -; SI-NEXT: v_readfirstlane_b32 s16, v2 -; SI-NEXT: v_readfirstlane_b32 s17, v3 -; SI-NEXT: v_readfirstlane_b32 s14, v4 -; SI-NEXT: v_readfirstlane_b32 s15, v5 -; SI-NEXT: v_readfirstlane_b32 s12, v6 -; SI-NEXT: v_readfirstlane_b32 s13, v7 -; SI-NEXT: v_readfirstlane_b32 s10, v8 -; SI-NEXT: v_readfirstlane_b32 s11, v9 -; SI-NEXT: v_readfirstlane_b32 s7, v10 -; SI-NEXT: v_readfirstlane_b32 s8, v11 -; SI-NEXT: v_readfirstlane_b32 s6, v12 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v13 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_writelane_b32 v28, s52, 12 +; SI-NEXT: v_readfirstlane_b32 s23, v19 +; SI-NEXT: v_readfirstlane_b32 s20, v15 +; SI-NEXT: v_readfirstlane_b32 s21, v16 +; SI-NEXT: v_readfirstlane_b32 s18, v17 +; SI-NEXT: v_readfirstlane_b32 s19, v18 +; SI-NEXT: v_readfirstlane_b32 s16, v0 +; SI-NEXT: v_readfirstlane_b32 s17, v1 +; SI-NEXT: v_readfirstlane_b32 s14, v2 +; SI-NEXT: v_readfirstlane_b32 s15, v3 +; SI-NEXT: v_readfirstlane_b32 s12, v4 +; SI-NEXT: v_readfirstlane_b32 s13, v5 +; SI-NEXT: v_readfirstlane_b32 s10, v6 +; SI-NEXT: v_readfirstlane_b32 s11, v7 +; SI-NEXT: v_readfirstlane_b32 s8, v8 +; SI-NEXT: v_readfirstlane_b32 s9, v9 +; SI-NEXT: v_readfirstlane_b32 s6, v10 +; SI-NEXT: v_readfirstlane_b32 s7, v11 +; SI-NEXT: v_readfirstlane_b32 s4, v12 +; SI-NEXT: s_and_b64 s[26:27], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v13 +; SI-NEXT: v_writelane_b32 v28, s53, 13 ; SI-NEXT: s_cbranch_scc0 .LBB45_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 -; SI-NEXT: s_lshr_b32 s4, s44, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 -; SI-NEXT: s_lshr_b32 s4, s43, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s41, 16 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v45, s4 -; SI-NEXT: s_lshr_b32 s4, s42, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s40, 16 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s40 +; SI-NEXT: s_lshr_b32 s30, s5, 16 +; SI-NEXT: s_lshr_b32 s31, s7, 16 +; SI-NEXT: s_lshr_b32 s34, s9, 16 +; SI-NEXT: s_lshr_b32 s35, s11, 16 +; SI-NEXT: s_lshr_b32 s36, s13, 16 +; SI-NEXT: s_lshr_b32 s37, s15, 16 +; SI-NEXT: s_lshr_b32 s38, s17, 16 +; SI-NEXT: s_lshr_b32 s39, s19, 16 +; SI-NEXT: s_lshr_b32 s48, s21, 16 +; SI-NEXT: s_lshr_b32 s49, s23, 16 +; SI-NEXT: s_lshr_b32 s50, s25, 16 +; SI-NEXT: s_lshr_b32 s51, s41, 16 +; SI-NEXT: s_lshr_b32 s52, s43, 16 +; SI-NEXT: s_lshr_b32 s53, s45, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[44:45], 16 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s40, 3 -; SI-NEXT: s_addc_u32 s5, s42, 0 -; SI-NEXT: s_lshr_b32 s29, s4, 16 -; SI-NEXT: s_lshr_b32 s40, s5, 16 -; SI-NEXT: s_add_u32 s41, s41, 3 -; SI-NEXT: s_addc_u32 s42, s43, 0 -; SI-NEXT: s_lshr_b32 s43, s41, 16 -; SI-NEXT: s_lshr_b32 s45, s42, 16 -; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s44, s44, 0 -; SI-NEXT: s_lshr_b32 s46, s24, 16 -; SI-NEXT: s_lshr_b32 s47, s44, 16 -; SI-NEXT: s_add_u32 s25, s25, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 -; SI-NEXT: s_lshr_b32 s56, s25, 16 -; SI-NEXT: s_lshr_b32 s57, s27, 16 -; SI-NEXT: s_add_u32 s26, s26, 3 -; SI-NEXT: s_addc_u32 s28, s28, 0 -; SI-NEXT: s_lshr_b32 s58, s26, 16 -; SI-NEXT: s_lshr_b32 s59, s28, 16 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_lshr_b32 s60, s22, 16 -; SI-NEXT: s_lshr_b32 s61, s23, 16 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_lshr_b32 s62, s20, 16 -; SI-NEXT: s_lshr_b32 s63, s21, 16 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_lshr_b32 s72, s18, 16 -; SI-NEXT: s_lshr_b32 s73, s19, 16 -; SI-NEXT: s_add_u32 s16, s16, 3 -; SI-NEXT: s_addc_u32 s17, s17, 0 -; SI-NEXT: s_lshr_b32 s74, s16, 16 -; SI-NEXT: s_lshr_b32 s75, s17, 16 -; SI-NEXT: s_add_u32 s14, s14, 3 -; SI-NEXT: s_addc_u32 s15, s15, 0 -; SI-NEXT: s_lshr_b32 s76, s14, 16 -; SI-NEXT: s_lshr_b32 s77, s15, 16 -; SI-NEXT: s_add_u32 s12, s12, 3 -; SI-NEXT: s_addc_u32 s13, s13, 0 -; SI-NEXT: s_lshr_b32 s78, s12, 16 -; SI-NEXT: s_lshr_b32 s79, s13, 16 -; SI-NEXT: s_add_u32 s10, s10, 3 -; SI-NEXT: s_addc_u32 s11, s11, 0 -; SI-NEXT: s_lshr_b32 s88, s10, 16 -; SI-NEXT: s_lshr_b32 s89, s11, 16 -; SI-NEXT: s_add_u32 s7, s7, 3 -; SI-NEXT: s_addc_u32 s8, s8, 0 -; SI-NEXT: s_lshr_b32 s90, s7, 16 -; SI-NEXT: s_lshr_b32 s91, s8, 16 +; SI-NEXT: s_add_u32 s4, s4, 3 +; SI-NEXT: s_addc_u32 s5, s5, 0 ; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s7, s7, 0 +; SI-NEXT: s_add_u32 s8, s8, 3 ; SI-NEXT: s_addc_u32 s9, s9, 0 -; SI-NEXT: s_lshr_b32 s92, s6, 16 -; SI-NEXT: s_lshr_b32 s93, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s44 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v43, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v44, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s93 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s92 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s91 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s90 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s89 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s88 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s79 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s78 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s77 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s76 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s75 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s74 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s73 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s72 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s63 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s62 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s40 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v47, s29 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s40, s40, 3 +; SI-NEXT: s_addc_u32 s41, s41, 0 +; SI-NEXT: s_add_u32 s42, s42, 3 +; SI-NEXT: s_addc_u32 s43, s43, 0 +; SI-NEXT: s_add_u32 s44, s44, 3 +; SI-NEXT: s_addc_u32 s45, s45, 0 +; SI-NEXT: s_lshr_b32 s30, s5, 16 +; SI-NEXT: s_lshr_b32 s31, s7, 16 +; SI-NEXT: s_lshr_b32 s34, s9, 16 +; SI-NEXT: s_lshr_b32 s35, s11, 16 +; SI-NEXT: s_lshr_b32 s36, s13, 16 +; SI-NEXT: s_lshr_b32 s37, s15, 16 +; SI-NEXT: s_lshr_b32 s38, s17, 16 +; SI-NEXT: s_lshr_b32 s39, s19, 16 +; SI-NEXT: s_lshr_b32 s48, s21, 16 +; SI-NEXT: s_lshr_b32 s49, s23, 16 +; SI-NEXT: s_lshr_b32 s50, s25, 16 +; SI-NEXT: s_lshr_b32 s51, s41, 16 +; SI-NEXT: s_lshr_b32 s52, s43, 16 +; SI-NEXT: s_lshr_b32 s53, s45, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[44:45], 16 ; SI-NEXT: .LBB45_3: ; %end -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 -; SI-NEXT: v_or_b32_e32 v2, v44, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v44 -; SI-NEXT: v_or_b32_e32 v5, v5, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: v_or_b32_e32 v7, v7, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v8 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 -; SI-NEXT: v_or_b32_e32 v3, v46, v3 -; SI-NEXT: v_or_b32_e32 v4, v43, v4 -; SI-NEXT: v_or_b32_e32 v6, v41, v6 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v40 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v9, v54, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v54 -; SI-NEXT: v_or_b32_e32 v11, v52, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 -; SI-NEXT: v_or_b32_e32 v13, v50, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v50 -; SI-NEXT: v_or_b32_e32 v15, v48, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v48 -; SI-NEXT: v_or_b32_e32 v17, v38, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v38 -; SI-NEXT: v_or_b32_e32 v19, v36, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v36 -; SI-NEXT: v_or_b32_e32 v21, v34, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v34 -; SI-NEXT: v_or_b32_e32 v23, v32, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v32 -; SI-NEXT: v_or_b32_e32 v25, v30, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v30 -; SI-NEXT: v_or_b32_e32 v8, v55, v8 -; SI-NEXT: v_or_b32_e32 v10, v53, v10 -; SI-NEXT: v_or_b32_e32 v12, v51, v12 -; SI-NEXT: v_or_b32_e32 v14, v49, v14 -; SI-NEXT: v_or_b32_e32 v16, v39, v16 -; SI-NEXT: v_or_b32_e32 v18, v37, v18 -; SI-NEXT: v_or_b32_e32 v20, v35, v20 -; SI-NEXT: v_or_b32_e32 v22, v33, v22 -; SI-NEXT: v_or_b32_e32 v24, v31, v24 -; SI-NEXT: v_or_b32_e32 v26, v29, v26 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: s_lshl_b32 s27, s92, 16 +; SI-NEXT: s_and_b32 s29, s44, 0xffff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s29, s45, 0xffff +; SI-NEXT: s_lshl_b32 s44, s53, 16 +; SI-NEXT: s_or_b32 s29, s29, s44 +; SI-NEXT: s_lshl_b32 s44, s90, 16 +; SI-NEXT: s_and_b32 s42, s42, 0xffff +; SI-NEXT: s_or_b32 s42, s42, s44 +; SI-NEXT: s_and_b32 s43, s43, 0xffff +; SI-NEXT: s_lshl_b32 s44, s52, 16 +; SI-NEXT: s_or_b32 s43, s43, s44 +; SI-NEXT: s_lshl_b32 s44, s88, 16 +; SI-NEXT: s_and_b32 s40, s40, 0xffff +; SI-NEXT: s_or_b32 s40, s40, s44 +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: s_lshl_b32 s44, s51, 16 +; SI-NEXT: s_or_b32 s41, s41, s44 +; SI-NEXT: s_lshl_b32 s44, s78, 16 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_or_b32 s24, s24, s44 +; SI-NEXT: s_and_b32 s25, s25, 0xffff +; SI-NEXT: s_lshl_b32 s44, s50, 16 +; SI-NEXT: s_or_b32 s25, s25, s44 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_lshl_b32 s44, s76, 16 +; SI-NEXT: s_or_b32 s22, s22, s44 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s44, s49, 16 +; SI-NEXT: s_or_b32 s23, s23, s44 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_lshl_b32 s44, s74, 16 +; SI-NEXT: s_or_b32 s20, s20, s44 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s44, s48, 16 +; SI-NEXT: s_or_b32 s21, s21, s44 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s44, s72, 16 +; SI-NEXT: s_or_b32 s18, s18, s44 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s44, s39, 16 +; SI-NEXT: s_or_b32 s19, s19, s44 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s44, s62, 16 +; SI-NEXT: s_or_b32 s16, s16, s44 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s38, 16 +; SI-NEXT: s_or_b32 s17, s17, s44 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s44, s60, 16 +; SI-NEXT: s_or_b32 s14, s14, s44 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s44, s37, 16 +; SI-NEXT: s_or_b32 s15, s15, s44 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s44, s58, 16 +; SI-NEXT: s_or_b32 s12, s12, s44 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s44, s36, 16 +; SI-NEXT: s_or_b32 s13, s13, s44 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s44, s56, 16 +; SI-NEXT: s_or_b32 s10, s10, s44 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s44, s35, 16 +; SI-NEXT: s_or_b32 s11, s11, s44 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s44, s46, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s28, s28, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s44 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s44, s34, 16 +; SI-NEXT: s_or_b32 s6, s6, s28 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s28, s31, 16 +; SI-NEXT: s_or_b32 s4, s4, s26 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s26, s30, 16 +; SI-NEXT: s_or_b32 s9, s9, s44 +; SI-NEXT: s_or_b32 s7, s7, s28 +; SI-NEXT: s_or_b32 s5, s5, s26 +; SI-NEXT: v_mov_b32_e32 v0, s27 +; SI-NEXT: v_mov_b32_e32 v1, s29 +; SI-NEXT: v_mov_b32_e32 v2, s42 +; SI-NEXT: v_mov_b32_e32 v3, s43 +; SI-NEXT: v_mov_b32_e32 v4, s40 +; SI-NEXT: v_mov_b32_e32 v5, s41 +; SI-NEXT: v_mov_b32_e32 v6, s24 +; SI-NEXT: v_mov_b32_e32 v7, s25 +; SI-NEXT: v_mov_b32_e32 v8, s22 +; SI-NEXT: v_mov_b32_e32 v9, s23 +; SI-NEXT: v_mov_b32_e32 v10, s20 +; SI-NEXT: v_mov_b32_e32 v11, s21 +; SI-NEXT: v_mov_b32_e32 v12, s18 +; SI-NEXT: v_mov_b32_e32 v13, s19 +; SI-NEXT: v_mov_b32_e32 v14, s16 +; SI-NEXT: v_mov_b32_e32 v15, s17 +; SI-NEXT: v_mov_b32_e32 v16, s14 +; SI-NEXT: v_mov_b32_e32 v17, s15 +; SI-NEXT: v_mov_b32_e32 v18, s12 +; SI-NEXT: v_mov_b32_e32 v19, s13 +; SI-NEXT: v_mov_b32_e32 v20, s10 +; SI-NEXT: v_mov_b32_e32 v21, s11 +; SI-NEXT: v_mov_b32_e32 v22, s8 +; SI-NEXT: v_mov_b32_e32 v23, s9 +; SI-NEXT: v_mov_b32_e32 v24, s6 +; SI-NEXT: v_mov_b32_e32 v25, s7 +; SI-NEXT: v_mov_b32_e32 v26, s4 +; SI-NEXT: v_mov_b32_e32 v27, s5 +; SI-NEXT: v_readlane_b32 s53, v28, 13 +; SI-NEXT: v_readlane_b32 s52, v28, 12 +; SI-NEXT: v_readlane_b32 s51, v28, 11 +; SI-NEXT: v_readlane_b32 s50, v28, 10 +; SI-NEXT: v_readlane_b32 s49, v28, 9 +; SI-NEXT: v_readlane_b32 s48, v28, 8 +; SI-NEXT: v_readlane_b32 s39, v28, 7 +; SI-NEXT: v_readlane_b32 s38, v28, 6 +; SI-NEXT: v_readlane_b32 s37, v28, 5 +; SI-NEXT: v_readlane_b32 s36, v28, 4 +; SI-NEXT: v_readlane_b32 s35, v28, 3 +; SI-NEXT: v_readlane_b32 s34, v28, 2 +; SI-NEXT: v_readlane_b32 s31, v28, 1 +; SI-NEXT: v_readlane_b32 s30, v28, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr30 ; SI-NEXT: s_branch .LBB45_2 ; ; VI-LABEL: bitcast_v14i64_to_v56f16_scalar: @@ -30696,205 +28602,264 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v59, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_mov_b32_e32 v48, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_mov_b32_e32 v49, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_mov_b32_e32 v50, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_mov_b32_e32 v51, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_mov_b32_e32 v52, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_mov_b32_e32 v53, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; SI-NEXT: v_mov_b32_e32 v54, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_mov_b32_e32 v55, v12 +; SI-NEXT: v_mov_b32_e32 v40, v11 +; SI-NEXT: v_mov_b32_e32 v41, v10 +; SI-NEXT: v_mov_b32_e32 v42, v9 +; SI-NEXT: v_mov_b32_e32 v43, v8 +; SI-NEXT: v_mov_b32_e32 v44, v7 +; SI-NEXT: v_mov_b32_e32 v45, v6 +; SI-NEXT: v_mov_b32_e32 v46, v5 +; SI-NEXT: v_mov_b32_e32 v47, v4 +; SI-NEXT: v_mov_b32_e32 v56, v3 +; SI-NEXT: v_mov_b32_e32 v57, v2 +; SI-NEXT: v_mov_b32_e32 v58, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v27 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v59 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v62 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v60 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v63 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v33 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v55 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v53 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v51 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v49 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 @@ -30938,145 +28903,20 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v33 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 -; SI-NEXT: v_or_b32_e32 v1, v44, v1 -; SI-NEXT: v_or_b32_e32 v2, v42, v2 -; SI-NEXT: v_or_b32_e32 v3, v40, v3 -; SI-NEXT: v_or_b32_e32 v4, v54, v4 -; SI-NEXT: v_or_b32_e32 v5, v52, v5 -; SI-NEXT: v_or_b32_e32 v6, v50, v6 -; SI-NEXT: v_or_b32_e32 v21, v60, v21 -; SI-NEXT: v_or_b32_e32 v22, v58, v22 -; SI-NEXT: v_or_b32_e32 v23, v48, v23 -; SI-NEXT: v_or_b32_e32 v24, v38, v24 -; SI-NEXT: v_or_b32_e32 v25, v36, v25 -; SI-NEXT: v_or_b32_e32 v26, v34, v26 -; SI-NEXT: v_or_b32_e32 v27, v32, v27 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v63 -; SI-NEXT: v_or_b32_e32 v20, v62, v20 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: .LBB46_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -31089,10 +28929,10 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v47 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -31101,167 +28941,152 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v46 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v45 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v60 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v39 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v38 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v36 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v49 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v33 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v43 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v40 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v55 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v54 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v53 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v52 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v51 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v48 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v63 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 @@ -31269,78 +29094,92 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v50 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v63 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v62 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v59 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v57 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v37 -; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v35 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v34 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 ; SI-NEXT: v_or_b32_e32 v27, v29, v27 @@ -32129,664 +29968,412 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-LABEL: bitcast_v56f16_to_v14i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: s_lshr_b32 s41, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s17 -; SI-NEXT: s_lshr_b32 s15, s18, 16 -; SI-NEXT: s_lshr_b32 s40, s17, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s19 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s21 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s12 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s10 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s8 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v31 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v29, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_mov_b32_e32 v32, v13 +; SI-NEXT: v_mov_b32_e32 v33, v12 +; SI-NEXT: v_mov_b32_e32 v34, v11 +; SI-NEXT: v_mov_b32_e32 v35, v10 +; SI-NEXT: v_mov_b32_e32 v36, v9 +; SI-NEXT: v_mov_b32_e32 v37, v8 +; SI-NEXT: v_mov_b32_e32 v38, v7 +; SI-NEXT: v_mov_b32_e32 v39, v6 +; SI-NEXT: v_mov_b32_e32 v48, v5 +; SI-NEXT: v_mov_b32_e32 v49, v4 +; SI-NEXT: v_mov_b32_e32 v50, v3 +; SI-NEXT: v_mov_b32_e32 v51, v2 +; SI-NEXT: v_mov_b32_e32 v52, v1 +; SI-NEXT: v_mov_b32_e32 v53, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v37 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v38 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v39 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v48 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v49 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v50 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v51 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v52 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v53 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v58 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v30 -; SI-NEXT: v_mov_b32_e32 v49, v39 -; SI-NEXT: v_or_b32_e32 v0, v39, v0 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_or_b32_e32 v1, v38, v1 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_or_b32_e32 v2, v53, v2 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v32, v48 -; SI-NEXT: v_or_b32_e32 v3, v48, v3 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v38, v37 -; SI-NEXT: v_or_b32_e32 v4, v37, v4 -; SI-NEXT: v_mov_b32_e32 v37, v36 -; SI-NEXT: v_mov_b32_e32 v48, v63 -; SI-NEXT: v_or_b32_e32 v5, v63, v5 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v63, v62 -; SI-NEXT: v_or_b32_e32 v6, v62, v6 -; SI-NEXT: v_or_b32_e32 v7, v59, v7 -; SI-NEXT: v_mov_b32_e32 v61, v60 -; SI-NEXT: v_mov_b32_e32 v59, v47 -; SI-NEXT: v_or_b32_e32 v8, v47, v8 -; SI-NEXT: v_mov_b32_e32 v56, v58 -; SI-NEXT: v_or_b32_e32 v11, v46, v11 -; SI-NEXT: v_or_b32_e32 v12, v45, v12 -; SI-NEXT: v_or_b32_e32 v13, v43, v13 -; SI-NEXT: v_or_b32_e32 v14, v41, v14 -; SI-NEXT: v_or_b32_e32 v15, v31, v15 -; SI-NEXT: v_or_b32_e32 v16, v28, v16 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v57 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v10, v33, v10 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v27, v57, v27 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v56 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v53 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v58 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v52 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v51 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v47 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v48 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v44 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v38 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v37 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v41 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v35 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v34 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v59 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v54 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v57 ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v17, v56 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v50 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v46 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v45 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v39 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v43 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v42 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v36 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 ; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v55 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v33 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 ; SI-NEXT: v_or_b32_e32 v27, v29, v27 ; SI-NEXT: .LBB47_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v57, v43 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v46, v42 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v45, v41 -; SI-NEXT: v_mov_b32_e32 v43, v31 -; SI-NEXT: v_mov_b32_e32 v42, v30 -; SI-NEXT: v_mov_b32_e32 v41, v28 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v28, v41 -; SI-NEXT: v_mov_b32_e32 v30, v42 -; SI-NEXT: v_mov_b32_e32 v41, v45 -; SI-NEXT: v_mov_b32_e32 v42, v46 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v49, v39 -; SI-NEXT: v_mov_b32_e32 v32, v48 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_mov_b32_e32 v38, v37 -; SI-NEXT: v_mov_b32_e32 v37, v36 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v48, v63 -; SI-NEXT: v_mov_b32_e32 v63, v62 -; SI-NEXT: v_mov_b32_e32 v61, v60 -; SI-NEXT: v_mov_b32_e32 v59, v47 -; SI-NEXT: v_mov_b32_e32 v56, v58 -; SI-NEXT: v_mov_b32_e32 v31, v43 -; SI-NEXT: v_mov_b32_e32 v43, v57 ; SI-NEXT: s_branch .LBB47_2 ; ; VI-LABEL: bitcast_v56f16_to_v14i64_scalar: @@ -37601,523 +35188,229 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v24 +; SI-NEXT: v_alignbit_b32 v28, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v29, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v30, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v31, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v32, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v33, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v34, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v35, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v36, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v37, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v39, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v50, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v53, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v40, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v62, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v28 -; SI-NEXT: v_mov_b32_e32 v28, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v26 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v33, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 ; SI-NEXT: .LBB52_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v16 -; SI-NEXT: v_add_f64 v[53:54], v[0:1], 1.0 ; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 ; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 ; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_alignbit_b32 v28, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v29, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v30, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v31, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v32, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v33, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v34, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v35, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v36, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v37, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v39, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v50, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v53, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v40, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v0 -; SI-NEXT: v_mov_b32_e32 v42, v26 -; SI-NEXT: v_mov_b32_e32 v40, v27 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 ; SI-NEXT: .LBB52_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_or_b32_e32 v0, v0, v40 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v47 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v40 +; SI-NEXT: v_or_b32_e32 v2, v2, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v46 +; SI-NEXT: v_or_b32_e32 v4, v4, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v45 +; SI-NEXT: v_or_b32_e32 v6, v6, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v44 +; SI-NEXT: v_or_b32_e32 v8, v8, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v43 +; SI-NEXT: v_or_b32_e32 v10, v10, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v42 +; SI-NEXT: v_or_b32_e32 v12, v12, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v41 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v54 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v50 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v37 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v38 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v32 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v35 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v63 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v61 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v56 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v44 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v31 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v45 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v41 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v18, v43 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v47 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v59 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v24, v62 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v40 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v14, v14, v34 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v55 +; SI-NEXT: v_or_b32_e32 v16, v16, v33 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v54 +; SI-NEXT: v_or_b32_e32 v18, v18, v32 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v52 +; SI-NEXT: v_or_b32_e32 v20, v20, v31 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v51 +; SI-NEXT: v_or_b32_e32 v22, v22, v30 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v49 +; SI-NEXT: v_or_b32_e32 v24, v24, v29 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_or_b32_e32 v26, v26, v28 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v38 +; SI-NEXT: v_or_b32_e32 v3, v3, v53 +; SI-NEXT: v_or_b32_e32 v5, v5, v50 +; SI-NEXT: v_or_b32_e32 v7, v7, v39 +; SI-NEXT: v_or_b32_e32 v9, v9, v37 +; SI-NEXT: v_or_b32_e32 v11, v11, v36 +; SI-NEXT: v_or_b32_e32 v13, v13, v35 +; SI-NEXT: v_or_b32_e32 v15, v15, v34 +; SI-NEXT: v_or_b32_e32 v17, v17, v33 +; SI-NEXT: v_or_b32_e32 v19, v19, v32 +; SI-NEXT: v_or_b32_e32 v21, v21, v31 +; SI-NEXT: v_or_b32_e32 v23, v23, v30 +; SI-NEXT: v_or_b32_e32 v25, v25, v29 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -38694,411 +35987,166 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v35, v9 +; SI-NEXT: v_lshr_b64 v[50:51], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[51:52], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[52:53], v[8:9], 16 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v28 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v7 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v4 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; SI-NEXT: v_lshr_b64 v[53:54], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[2:3], 16 ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v15 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v19 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v27 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v25 +; SI-NEXT: v_lshr_b64 v[54:55], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[41:42], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[26:27], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[20:21], 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v29 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v1 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v0 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v29 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v19 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v10 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v24 +; SI-NEXT: v_lshr_b64 v[28:29], v[24:25], 16 ; SI-NEXT: s_cbranch_execnz .LBB53_3 ; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_lshr_b64 v[50:51], v[12:13], 16 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_lshr_b64 v[51:52], v[10:11], 16 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 ; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 -; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 -; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 -; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; SI-NEXT: v_add_f64 v[53:54], v[20:21], 1.0 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v40 -; SI-NEXT: v_mov_b32_e32 v40, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v62 -; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 ; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_lshr_b64 v[52:53], v[8:9], 16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; SI-NEXT: v_lshr_b64 v[53:54], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[2:3], 16 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[54:55], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[41:42], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[26:27], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[20:21], 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v20 -; SI-NEXT: v_mov_b32_e32 v29, v13 -; SI-NEXT: v_mov_b32_e32 v46, v10 -; SI-NEXT: v_mov_b32_e32 v44, v11 -; SI-NEXT: v_mov_b32_e32 v42, v12 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[28:29], v[24:25], 16 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v25 ; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v28, v24, v28 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v43 +; SI-NEXT: v_or_b32_e32 v29, v24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v30 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v30, v20, v24 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v63 +; SI-NEXT: v_or_b32_e32 v31, v20, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v32 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v32, v21, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v62 +; SI-NEXT: v_or_b32_e32 v33, v20, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v34 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v34, v21, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v61 +; SI-NEXT: v_or_b32_e32 v35, v20, v21 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v36 +; SI-NEXT: v_or_b32_e32 v36, v18, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v60 +; SI-NEXT: v_or_b32_e32 v37, v18, v19 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v38 +; SI-NEXT: v_or_b32_e32 v38, v16, v18 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v59 +; SI-NEXT: v_or_b32_e32 v39, v16, v17 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v48 +; SI-NEXT: v_or_b32_e32 v48, v14, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v58 +; SI-NEXT: v_or_b32_e32 v49, v14, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 +; SI-NEXT: v_or_b32_e32 v14, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v56 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v54 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v52 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v32 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v50 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v28 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v38 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v59 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v35 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v10, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v57 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v43 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v56 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v60 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v63 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v46 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v42 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -39115,90 +36163,61 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v31 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: v_mov_b32_e32 v2, v30 +; SI-NEXT: v_mov_b32_e32 v3, v31 +; SI-NEXT: v_mov_b32_e32 v4, v32 +; SI-NEXT: v_mov_b32_e32 v5, v33 +; SI-NEXT: v_mov_b32_e32 v6, v34 +; SI-NEXT: v_mov_b32_e32 v7, v35 +; SI-NEXT: v_mov_b32_e32 v8, v36 +; SI-NEXT: v_mov_b32_e32 v9, v37 +; SI-NEXT: v_mov_b32_e32 v10, v38 +; SI-NEXT: v_mov_b32_e32 v11, v39 +; SI-NEXT: v_mov_b32_e32 v12, v48 +; SI-NEXT: v_mov_b32_e32 v13, v49 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v28 +; SI-NEXT: v_mov_b32_e32 v1, v29 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: s_branch .LBB53_2 ; ; VI-LABEL: bitcast_v14f64_to_v56f16_scalar: @@ -40020,205 +37039,264 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v59, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_mov_b32_e32 v48, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_mov_b32_e32 v49, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_mov_b32_e32 v50, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_mov_b32_e32 v51, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_mov_b32_e32 v52, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_mov_b32_e32 v53, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; SI-NEXT: v_mov_b32_e32 v54, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_mov_b32_e32 v55, v12 +; SI-NEXT: v_mov_b32_e32 v40, v11 +; SI-NEXT: v_mov_b32_e32 v41, v10 +; SI-NEXT: v_mov_b32_e32 v42, v9 +; SI-NEXT: v_mov_b32_e32 v43, v8 +; SI-NEXT: v_mov_b32_e32 v44, v7 +; SI-NEXT: v_mov_b32_e32 v45, v6 +; SI-NEXT: v_mov_b32_e32 v46, v5 +; SI-NEXT: v_mov_b32_e32 v47, v4 +; SI-NEXT: v_mov_b32_e32 v56, v3 +; SI-NEXT: v_mov_b32_e32 v57, v2 +; SI-NEXT: v_mov_b32_e32 v58, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v27 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v59 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v62 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v60 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v63 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v33 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v55 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v53 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v51 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v49 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 @@ -40262,145 +37340,20 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v33 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 -; SI-NEXT: v_or_b32_e32 v1, v44, v1 -; SI-NEXT: v_or_b32_e32 v2, v42, v2 -; SI-NEXT: v_or_b32_e32 v3, v40, v3 -; SI-NEXT: v_or_b32_e32 v4, v54, v4 -; SI-NEXT: v_or_b32_e32 v5, v52, v5 -; SI-NEXT: v_or_b32_e32 v6, v50, v6 -; SI-NEXT: v_or_b32_e32 v21, v60, v21 -; SI-NEXT: v_or_b32_e32 v22, v58, v22 -; SI-NEXT: v_or_b32_e32 v23, v48, v23 -; SI-NEXT: v_or_b32_e32 v24, v38, v24 -; SI-NEXT: v_or_b32_e32 v25, v36, v25 -; SI-NEXT: v_or_b32_e32 v26, v34, v26 -; SI-NEXT: v_or_b32_e32 v27, v32, v27 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v63 -; SI-NEXT: v_or_b32_e32 v20, v62, v20 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: .LBB54_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -40413,10 +37366,10 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v47 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -40425,167 +37378,152 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v46 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v45 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v60 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v39 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v38 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v36 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v49 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v33 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v43 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v40 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v55 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v54 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v53 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v52 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v51 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v48 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v63 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 @@ -40593,78 +37531,92 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v50 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v63 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v62 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v59 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v57 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v37 -; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v35 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v34 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 ; SI-NEXT: v_or_b32_e32 v27, v29, v27 @@ -41453,664 +38405,412 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-LABEL: bitcast_v56f16_to_v14f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: s_lshr_b32 s41, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s17 -; SI-NEXT: s_lshr_b32 s15, s18, 16 -; SI-NEXT: s_lshr_b32 s40, s17, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s19 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s21 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s12 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s10 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s8 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v31 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v29, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_mov_b32_e32 v32, v13 +; SI-NEXT: v_mov_b32_e32 v33, v12 +; SI-NEXT: v_mov_b32_e32 v34, v11 +; SI-NEXT: v_mov_b32_e32 v35, v10 +; SI-NEXT: v_mov_b32_e32 v36, v9 +; SI-NEXT: v_mov_b32_e32 v37, v8 +; SI-NEXT: v_mov_b32_e32 v38, v7 +; SI-NEXT: v_mov_b32_e32 v39, v6 +; SI-NEXT: v_mov_b32_e32 v48, v5 +; SI-NEXT: v_mov_b32_e32 v49, v4 +; SI-NEXT: v_mov_b32_e32 v50, v3 +; SI-NEXT: v_mov_b32_e32 v51, v2 +; SI-NEXT: v_mov_b32_e32 v52, v1 +; SI-NEXT: v_mov_b32_e32 v53, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v37 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v38 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v39 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v48 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v49 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v50 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v51 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v52 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v53 ; SI-NEXT: s_cbranch_scc0 .LBB55_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v58 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v30 -; SI-NEXT: v_mov_b32_e32 v49, v39 -; SI-NEXT: v_or_b32_e32 v0, v39, v0 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_or_b32_e32 v1, v38, v1 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_or_b32_e32 v2, v53, v2 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v32, v48 -; SI-NEXT: v_or_b32_e32 v3, v48, v3 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v38, v37 -; SI-NEXT: v_or_b32_e32 v4, v37, v4 -; SI-NEXT: v_mov_b32_e32 v37, v36 -; SI-NEXT: v_mov_b32_e32 v48, v63 -; SI-NEXT: v_or_b32_e32 v5, v63, v5 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v63, v62 -; SI-NEXT: v_or_b32_e32 v6, v62, v6 -; SI-NEXT: v_or_b32_e32 v7, v59, v7 -; SI-NEXT: v_mov_b32_e32 v61, v60 -; SI-NEXT: v_mov_b32_e32 v59, v47 -; SI-NEXT: v_or_b32_e32 v8, v47, v8 -; SI-NEXT: v_mov_b32_e32 v56, v58 -; SI-NEXT: v_or_b32_e32 v11, v46, v11 -; SI-NEXT: v_or_b32_e32 v12, v45, v12 -; SI-NEXT: v_or_b32_e32 v13, v43, v13 -; SI-NEXT: v_or_b32_e32 v14, v41, v14 -; SI-NEXT: v_or_b32_e32 v15, v31, v15 -; SI-NEXT: v_or_b32_e32 v16, v28, v16 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v57 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v10, v33, v10 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v27, v57, v27 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v56 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB55_3 ; SI-NEXT: .LBB55_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v53 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v58 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v52 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v51 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v47 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v48 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v44 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v38 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v37 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v41 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v35 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v34 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v59 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v54 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v57 ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v17, v56 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v50 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v46 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v45 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v39 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v43 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v42 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v36 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 ; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v55 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v33 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 ; SI-NEXT: v_or_b32_e32 v27, v29, v27 ; SI-NEXT: .LBB55_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB55_4: -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v57, v43 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v46, v42 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v45, v41 -; SI-NEXT: v_mov_b32_e32 v43, v31 -; SI-NEXT: v_mov_b32_e32 v42, v30 -; SI-NEXT: v_mov_b32_e32 v41, v28 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v28, v41 -; SI-NEXT: v_mov_b32_e32 v30, v42 -; SI-NEXT: v_mov_b32_e32 v41, v45 -; SI-NEXT: v_mov_b32_e32 v42, v46 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v49, v39 -; SI-NEXT: v_mov_b32_e32 v32, v48 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_mov_b32_e32 v38, v37 -; SI-NEXT: v_mov_b32_e32 v37, v36 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v48, v63 -; SI-NEXT: v_mov_b32_e32 v63, v62 -; SI-NEXT: v_mov_b32_e32 v61, v60 -; SI-NEXT: v_mov_b32_e32 v59, v47 -; SI-NEXT: v_mov_b32_e32 v56, v58 -; SI-NEXT: v_mov_b32_e32 v31, v43 -; SI-NEXT: v_mov_b32_e32 v43, v57 ; SI-NEXT: s_branch .LBB55_2 ; ; VI-LABEL: bitcast_v56f16_to_v14f64_scalar: @@ -42799,7 +39499,67 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v56i16_to_v56f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v3 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v4 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v6 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v9 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v11 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v13 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v58 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v15 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 @@ -42851,743 +39611,681 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v17 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v0 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; kill: killed $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v44 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v55 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; kill: killed $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; kill: killed $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v48 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v54 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_or_b32_e32 v60, v1, v28 +; SI-NEXT: v_mov_b32_e32 v28, v39 +; SI-NEXT: v_mov_b32_e32 v39, v43 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v60, v43, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v51 +; SI-NEXT: v_or_b32_e32 v58, v1, v3 +; SI-NEXT: v_alignbit_b32 v1, v58, v39, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; SI-NEXT: v_or_b32_e32 v57, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v0, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v41 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_mov_b32_e32 v39, v28 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_alignbit_b32 v1, v57, v5, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v47, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v0, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_alignbit_b32 v1, v47, v7, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v45, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v1, v45, v46, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v0, v0, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v59 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v62 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v56 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v32 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v57 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v0, v0, v61 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v60 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_or_b32_e32 v44, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v1, v44, v56, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v42, v1, v3 +; SI-NEXT: v_alignbit_b32 v1, v42, v59, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v40, v1, v35 +; SI-NEXT: v_alignbit_b32 v1, v40, v62, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v55, v1, v36 +; SI-NEXT: v_alignbit_b32 v1, v55, v29, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v53, v1, v34 +; SI-NEXT: v_alignbit_b32 v1, v53, v30, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v52, v1, v41 +; SI-NEXT: v_alignbit_b32 v1, v52, v31, 16 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_or_b32_e32 v50, v1, v37 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_alignbit_b32 v1, v50, v32, 16 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: ; kill: killed $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v48, v1, v33 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_alignbit_b32 v1, v48, v63, 16 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_or_b32_e32 v3, v1, v38 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_alignbit_b32 v1, v3, v61, 16 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v28, v3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v62 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v25 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: .LBB56_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v61, v26 +; SI-NEXT: v_add_i32_e32 v39, vcc, 0x30000, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v26, v38, v26 +; SI-NEXT: v_or_b32_e32 v24, v63, v24 +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v24, v33, v24 +; SI-NEXT: v_or_b32_e32 v22, v32, v22 +; SI-NEXT: v_add_i32_e32 v48, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v22, v37, v22 +; SI-NEXT: v_or_b32_e32 v20, v31, v20 +; SI-NEXT: v_add_i32_e32 v50, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v20, v41, v20 +; SI-NEXT: v_or_b32_e32 v18, v30, v18 +; SI-NEXT: v_add_i32_e32 v52, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v18, v34, v18 +; SI-NEXT: v_or_b32_e32 v16, v29, v16 +; SI-NEXT: v_add_i32_e32 v53, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v16, v36, v16 +; SI-NEXT: v_or_b32_e32 v14, v62, v14 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v40 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 -; SI-NEXT: v_add_i32_e32 v41, vcc, 3, v41 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v42 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v41 -; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v43 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v44 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 -; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v45 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v46 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v47 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v56 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v44 -; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v55, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v14, v35, v14 +; SI-NEXT: v_or_b32_e32 v12, v59, v12 +; SI-NEXT: v_add_i32_e32 v40, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v56, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v46, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v43, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v42, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_add_i32_e32 v44, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_add_i32_e32 v45, vcc, s6, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_add_i32_e32 v47, vcc, s6, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v57, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v57 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v58, vcc, s6, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v60, vcc, s6, v0 +; SI-NEXT: v_alignbit_b32 v0, v60, v2, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v58 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 -; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v59 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: v_add_i32_e32 v51, vcc, 3, v60 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v61 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 -; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v62 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v63 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v58, v4, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v57, v6, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v47, v8, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v45, v10, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v44, v12, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v54 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v42, v14, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v40, v16, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v55, v18, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 -; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 -; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 -; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v48 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v53, v20, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v52, v22, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v50, v24, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v48, v26, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v28, v39, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: .LBB56_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v52 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v55 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v60 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v58 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v47 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v45 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v49 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v24, v51 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v53 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v56i16_to_v56f16: @@ -44152,577 +40850,629 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-LABEL: bitcast_v56i16_to_v56f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_writelane_b32 v28, s30, 0 +; SI-NEXT: v_writelane_b32 v28, s31, 1 +; SI-NEXT: v_writelane_b32 v28, s34, 2 +; SI-NEXT: v_writelane_b32 v28, s35, 3 +; SI-NEXT: v_writelane_b32 v28, s36, 4 +; SI-NEXT: v_writelane_b32 v28, s37, 5 +; SI-NEXT: v_writelane_b32 v28, s38, 6 +; SI-NEXT: v_writelane_b32 v28, s39, 7 +; SI-NEXT: v_writelane_b32 v28, s48, 8 +; SI-NEXT: v_writelane_b32 v28, s49, 9 +; SI-NEXT: v_writelane_b32 v28, s50, 10 +; SI-NEXT: v_writelane_b32 v28, s51, 11 +; SI-NEXT: v_writelane_b32 v28, s52, 12 +; SI-NEXT: v_writelane_b32 v28, s53, 13 +; SI-NEXT: v_writelane_b32 v28, s54, 14 +; SI-NEXT: v_writelane_b32 v28, s55, 15 +; SI-NEXT: v_writelane_b32 v28, s64, 16 +; SI-NEXT: v_writelane_b32 v28, s65, 17 +; SI-NEXT: v_writelane_b32 v28, s66, 18 +; SI-NEXT: v_writelane_b32 v28, s67, 19 +; SI-NEXT: v_writelane_b32 v28, s68, 20 +; SI-NEXT: v_writelane_b32 v28, s69, 21 +; SI-NEXT: v_writelane_b32 v28, s70, 22 +; SI-NEXT: v_writelane_b32 v28, s71, 23 +; SI-NEXT: v_writelane_b32 v28, s80, 24 +; SI-NEXT: v_writelane_b32 v28, s81, 25 +; SI-NEXT: v_writelane_b32 v28, s82, 26 +; SI-NEXT: v_writelane_b32 v28, s83, 27 +; SI-NEXT: v_writelane_b32 v28, s84, 28 +; SI-NEXT: v_writelane_b32 v28, s85, 29 +; SI-NEXT: v_writelane_b32 v28, s86, 30 +; SI-NEXT: v_writelane_b32 v28, s87, 31 +; SI-NEXT: ; implicit-def: $vgpr29 : SGPR spill to VGPR lane +; SI-NEXT: v_writelane_b32 v28, s96, 32 +; SI-NEXT: s_lshr_b32 s66, s29, 16 +; SI-NEXT: s_lshr_b32 s93, s28, 16 +; SI-NEXT: s_lshr_b32 s65, s27, 16 +; SI-NEXT: s_lshr_b32 s91, s26, 16 +; SI-NEXT: s_lshr_b32 s64, s25, 16 +; SI-NEXT: s_lshr_b32 s89, s24, 16 +; SI-NEXT: s_lshr_b32 s55, s23, 16 +; SI-NEXT: s_lshr_b32 s79, s22, 16 +; SI-NEXT: s_lshr_b32 s54, s21, 16 +; SI-NEXT: s_lshr_b32 s77, s20, 16 +; SI-NEXT: s_lshr_b32 s85, s19, 16 +; SI-NEXT: s_lshr_b32 s75, s18, 16 +; SI-NEXT: s_lshr_b32 s83, s17, 16 +; SI-NEXT: s_lshr_b32 s73, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: s_lshr_b32 s43, s29, 16 -; SI-NEXT: s_lshr_b32 s42, s28, 16 -; SI-NEXT: s_lshr_b32 s41, s27, 16 -; SI-NEXT: s_lshr_b32 s40, s26, 16 -; SI-NEXT: s_lshr_b32 s15, s25, 16 -; SI-NEXT: s_lshr_b32 s14, s24, 16 -; SI-NEXT: s_lshr_b32 s13, s23, 16 -; SI-NEXT: s_lshr_b32 s12, s22, 16 -; SI-NEXT: s_lshr_b32 s11, s21, 16 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s16, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v29, s17, 0 +; SI-NEXT: v_writelane_b32 v28, s97, 33 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_readfirstlane_b32 s50, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; SI-NEXT: v_writelane_b32 v29, s16, 1 +; SI-NEXT: v_writelane_b32 v28, s98, 34 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 +; SI-NEXT: v_readfirstlane_b32 s97, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; SI-NEXT: v_readfirstlane_b32 s5, v9 +; SI-NEXT: v_writelane_b32 v29, s19, 2 +; SI-NEXT: v_writelane_b32 v28, s99, 35 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; SI-NEXT: v_readfirstlane_b32 s99, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v8 +; SI-NEXT: v_readfirstlane_b32 s7, v11 +; SI-NEXT: v_writelane_b32 v29, s5, 3 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; SI-NEXT: v_readfirstlane_b32 s84, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 +; SI-NEXT: v_readfirstlane_b32 s98, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_readfirstlane_b32 s86, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_readfirstlane_b32 s39, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_readfirstlane_b32 s51, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_readfirstlane_b32 s31, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_readfirstlane_b32 s9, v13 +; SI-NEXT: v_writelane_b32 v29, s7, 4 +; SI-NEXT: v_readfirstlane_b32 s87, v4 +; SI-NEXT: v_readfirstlane_b32 s37, v3 +; SI-NEXT: v_readfirstlane_b32 s49, v2 +; SI-NEXT: v_readfirstlane_b32 s95, v1 +; SI-NEXT: v_readfirstlane_b32 s35, v0 +; SI-NEXT: v_readfirstlane_b32 s81, v15 +; SI-NEXT: v_readfirstlane_b32 s82, v16 +; SI-NEXT: v_readfirstlane_b32 s80, v17 +; SI-NEXT: v_readfirstlane_b32 s10, v18 +; SI-NEXT: v_readfirstlane_b32 s71, v19 +; SI-NEXT: v_readfirstlane_b32 s70, v12 +; SI-NEXT: v_readfirstlane_b32 s69, v10 +; SI-NEXT: v_readfirstlane_b32 s68, v8 +; SI-NEXT: v_readfirstlane_b32 s96, v7 +; SI-NEXT: v_readfirstlane_b32 s67, v6 +; SI-NEXT: v_readfirstlane_b32 s53, v5 +; SI-NEXT: v_writelane_b32 v29, s9, 5 +; SI-NEXT: v_writelane_b32 v29, s10, 6 ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v14, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s13 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s6 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v15, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s14 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s17 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v15, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s15 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s7 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v15, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s40 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s41 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v49, s42 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s43 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s19 -; SI-NEXT: v_mov_b32_e32 v28, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v1 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s9 -; SI-NEXT: v_mov_b32_e32 v29, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v18 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v30, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v19 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v3 -; SI-NEXT: v_mov_b32_e32 v31, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v20 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 -; SI-NEXT: v_mov_b32_e32 v32, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v23 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v5 -; SI-NEXT: v_mov_b32_e32 v33, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v41 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v6 -; SI-NEXT: v_mov_b32_e32 v34, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v43 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v7 -; SI-NEXT: v_mov_b32_e32 v35, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v45 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v8 -; SI-NEXT: v_mov_b32_e32 v19, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v16 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v9 -; SI-NEXT: v_mov_b32_e32 v21, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v10 -; SI-NEXT: v_mov_b32_e32 v16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v11 -; SI-NEXT: v_mov_b32_e32 v23, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v12 -; SI-NEXT: v_mov_b32_e32 v17, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v13 -; SI-NEXT: v_mov_b32_e32 v18, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: s_lshl_b32 s40, s5, 16 +; SI-NEXT: s_lshl_b32 s14, s7, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s83, 16 +; SI-NEXT: s_lshl_b32 s72, s73, 16 +; SI-NEXT: s_mov_b32 s76, s73 +; SI-NEXT: s_or_b32 s73, s5, s7 +; SI-NEXT: s_and_b32 s5, s19, 0xffff +; SI-NEXT: s_lshl_b32 s7, s85, 16 +; SI-NEXT: s_lshl_b32 s74, s75, 16 +; SI-NEXT: s_lshl_b32 s62, s77, 16 +; SI-NEXT: s_mov_b32 s78, s77 +; SI-NEXT: s_mov_b32 s77, s75 +; SI-NEXT: s_or_b32 s75, s5, s7 +; SI-NEXT: s_and_b32 s5, s21, 0xffff +; SI-NEXT: s_lshl_b32 s7, s54, 16 +; SI-NEXT: s_or_b32 s63, s5, s7 +; SI-NEXT: s_and_b32 s5, s23, 0xffff +; SI-NEXT: s_lshl_b32 s7, s55, 16 +; SI-NEXT: s_or_b32 s61, s5, s7 +; SI-NEXT: s_and_b32 s5, s25, 0xffff +; SI-NEXT: s_lshl_b32 s7, s64, 16 +; SI-NEXT: s_or_b32 s59, s5, s7 +; SI-NEXT: s_and_b32 s5, s27, 0xffff +; SI-NEXT: s_lshl_b32 s7, s65, 16 +; SI-NEXT: s_or_b32 s57, s5, s7 +; SI-NEXT: s_and_b32 s5, s29, 0xffff +; SI-NEXT: s_lshl_b32 s7, s66, 16 +; SI-NEXT: s_or_b32 s47, s5, s7 +; SI-NEXT: s_and_b32 s5, s95, 0xffff +; SI-NEXT: s_lshl_b32 s7, s67, 16 +; SI-NEXT: s_or_b32 s45, s5, s7 +; SI-NEXT: s_and_b32 s5, s37, 0xffff +; SI-NEXT: s_lshl_b32 s7, s68, 16 +; SI-NEXT: s_or_b32 s43, s5, s7 +; SI-NEXT: s_and_b32 s5, s31, 0xffff +; SI-NEXT: s_lshl_b32 s7, s69, 16 +; SI-NEXT: s_or_b32 s41, s5, s7 +; SI-NEXT: s_and_b32 s5, s39, 0xffff +; SI-NEXT: s_lshl_b32 s7, s70, 16 +; SI-NEXT: s_or_b32 s15, s5, s7 +; SI-NEXT: s_and_b32 s5, s50, 0xffff +; SI-NEXT: s_lshl_b32 s7, s71, 16 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_or_b32 s13, s5, s7 +; SI-NEXT: s_and_b32 s5, s97, 0xffff +; SI-NEXT: s_lshl_b32 s7, s80, 16 +; SI-NEXT: s_or_b32 s8, s4, s72 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_or_b32 s11, s5, s7 +; SI-NEXT: s_and_b32 s5, s99, 0xffff +; SI-NEXT: s_lshl_b32 s7, s81, 16 +; SI-NEXT: s_or_b32 s6, s4, s74 +; SI-NEXT: s_lshl_b32 s12, s9, 16 +; SI-NEXT: s_or_b32 vcc_hi, s5, s7 +; SI-NEXT: s_mov_b32 s9, s73 +; SI-NEXT: s_lshr_b64 s[72:73], s[72:73], 16 +; SI-NEXT: s_mov_b32 s7, s75 +; SI-NEXT: s_lshr_b64 s[74:75], s[74:75], 16 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s60, s79, 16 +; SI-NEXT: s_mov_b32 s73, s76 +; SI-NEXT: s_mov_b32 s75, s77 +; SI-NEXT: s_lshr_b64 s[76:77], s[62:63], 16 +; SI-NEXT: s_or_b32 s4, s4, s62 +; SI-NEXT: s_lshl_b32 s58, s89, 16 +; SI-NEXT: s_mov_b32 s77, s78 +; SI-NEXT: s_and_b32 s62, s22, 0xffff +; SI-NEXT: s_mov_b32 s88, s79 +; SI-NEXT: s_lshr_b64 s[78:79], s[60:61], 16 +; SI-NEXT: s_lshl_b32 s56, s91, 16 +; SI-NEXT: s_or_b32 s62, s62, s60 +; SI-NEXT: s_mov_b32 s79, s88 +; SI-NEXT: s_and_b32 s60, s24, 0xffff +; SI-NEXT: s_mov_b32 s90, s89 +; SI-NEXT: s_lshr_b64 s[88:89], s[58:59], 16 +; SI-NEXT: s_lshl_b32 s46, s93, 16 +; SI-NEXT: s_or_b32 s60, s60, s58 +; SI-NEXT: s_mov_b32 s89, s90 +; SI-NEXT: s_and_b32 s58, s26, 0xffff +; SI-NEXT: s_mov_b32 s92, s91 +; SI-NEXT: s_lshr_b64 s[90:91], s[56:57], 16 +; SI-NEXT: s_lshl_b32 s44, s53, 16 +; SI-NEXT: s_or_b32 s58, s58, s56 +; SI-NEXT: s_mov_b32 s91, s92 +; SI-NEXT: s_and_b32 s56, s28, 0xffff +; SI-NEXT: s_mov_b32 s94, s93 +; SI-NEXT: s_lshr_b64 s[92:93], s[46:47], 16 +; SI-NEXT: s_lshl_b32 s42, s96, 16 +; SI-NEXT: s_or_b32 s56, s56, s46 +; SI-NEXT: s_mov_b32 s93, s94 +; SI-NEXT: s_and_b32 s46, s35, 0xffff +; SI-NEXT: s_mov_b32 s30, s95 +; SI-NEXT: s_lshr_b64 s[94:95], s[44:45], 16 +; SI-NEXT: s_or_b32 s46, s46, s44 +; SI-NEXT: s_mov_b32 s95, s30 +; SI-NEXT: s_and_b32 s44, s49, 0xffff +; SI-NEXT: s_mov_b32 s34, s31 +; SI-NEXT: s_lshr_b64 s[30:31], s[42:43], 16 +; SI-NEXT: s_or_b32 s44, s44, s42 +; SI-NEXT: s_mov_b32 s31, s34 +; SI-NEXT: s_and_b32 s42, s87, 0xffff +; SI-NEXT: s_mov_b32 s36, s35 +; SI-NEXT: s_lshr_b64 s[34:35], s[40:41], 16 +; SI-NEXT: s_or_b32 s42, s42, s40 +; SI-NEXT: s_mov_b32 s35, s36 +; SI-NEXT: s_and_b32 s40, s51, 0xffff +; SI-NEXT: s_mov_b32 s38, s37 +; SI-NEXT: s_lshr_b64 s[36:37], s[14:15], 16 +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_or_b32 s40, s40, s14 +; SI-NEXT: s_mov_b32 s37, s38 +; SI-NEXT: s_and_b32 s14, s86, 0xffff +; SI-NEXT: s_mov_b32 s48, s39 +; SI-NEXT: s_lshr_b64 s[38:39], s[12:13], 16 +; SI-NEXT: s_lshl_b32 vcc_lo, s82, 16 +; SI-NEXT: s_or_b32 s14, s14, s12 +; SI-NEXT: s_mov_b32 s39, s48 +; SI-NEXT: s_and_b32 s12, s98, 0xffff +; SI-NEXT: s_mov_b32 s16, s82 +; SI-NEXT: s_mov_b32 s82, s99 +; SI-NEXT: s_mov_b32 s99, s98 +; SI-NEXT: s_mov_b32 s98, s96 +; SI-NEXT: s_mov_b32 s96, s86 +; SI-NEXT: s_mov_b32 s86, s50 +; SI-NEXT: s_mov_b32 s50, s49 +; SI-NEXT: s_lshr_b64 s[48:49], s[10:11], 16 +; SI-NEXT: s_or_b32 s12, s12, s10 +; SI-NEXT: s_mov_b32 s49, s50 +; SI-NEXT: s_mov_b32 s50, s86 +; SI-NEXT: s_mov_b32 s86, s96 +; SI-NEXT: s_and_b32 s10, s84, 0xffff +; SI-NEXT: s_mov_b32 s96, s53 +; SI-NEXT: s_lshr_b64 s[52:53], vcc, 16 +; SI-NEXT: s_mov_b32 s5, s63 +; SI-NEXT: s_mov_b32 s63, s61 +; SI-NEXT: s_mov_b32 s61, s59 +; SI-NEXT: s_mov_b32 s59, s57 +; SI-NEXT: s_mov_b32 s57, s47 +; SI-NEXT: s_mov_b32 s47, s45 +; SI-NEXT: s_mov_b32 s45, s43 +; SI-NEXT: s_mov_b32 s43, s41 +; SI-NEXT: s_mov_b32 s41, s15 +; SI-NEXT: s_mov_b32 s15, s13 +; SI-NEXT: s_mov_b32 s13, s11 +; SI-NEXT: s_or_b32 s10, s10, vcc_lo +; SI-NEXT: s_mov_b32 s11, vcc_hi +; SI-NEXT: s_mov_b32 s53, s96 +; SI-NEXT: s_mov_b32 s96, s98 +; SI-NEXT: s_mov_b32 s98, s99 +; SI-NEXT: s_mov_b32 s99, s82 +; SI-NEXT: s_mov_b32 s82, s16 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s16 -; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s6 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s8, s8, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s17 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s10, s10, 3 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s7 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s22 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s12, s12, 3 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s8 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v15, s24 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s19 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v15, s26 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s9 +; SI-NEXT: s_add_i32 s84, s84, 3 +; SI-NEXT: s_and_b32 s4, s84, 0xffff +; SI-NEXT: s_lshl_b32 s5, s82, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s82, s99, 3 +; SI-NEXT: s_add_i32 s10, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s82, 0xffff +; SI-NEXT: s_lshl_b32 s5, s81, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s99, s98, 3 +; SI-NEXT: v_readlane_b32 s5, v29, 6 +; SI-NEXT: s_add_i32 s11, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s99, 0xffff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s12, s4, 0x30000 +; SI-NEXT: s_add_i32 s4, s97, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s80, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s13, s4, 0x30000 +; SI-NEXT: s_add_i32 s4, s86, 3 +; SI-NEXT: v_readlane_b32 s5, v29, 5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s14, s4, 0x30000 +; SI-NEXT: s_add_i32 s4, s50, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s71, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s15, s4, 0x30000 +; SI-NEXT: s_add_i32 s4, s51, 3 +; SI-NEXT: v_readlane_b32 s5, v29, 4 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s40, s4, 0x30000 +; SI-NEXT: s_add_i32 s4, s39, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s70, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s41, s4, 0x30000 +; SI-NEXT: s_add_i32 s4, s87, 3 +; SI-NEXT: v_readlane_b32 s5, v29, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s42, s4, 0x30000 +; SI-NEXT: s_add_i32 s4, s31, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s69, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s43, s4, 0x30000 +; SI-NEXT: s_add_i32 s4, s49, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s96, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s44, s4, 0x30000 +; SI-NEXT: s_add_i32 s4, s37, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s68, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s45, s4, 0x30000 +; SI-NEXT: s_add_i32 s4, s35, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s53, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s46, s4, 0x30000 +; SI-NEXT: s_add_i32 s4, s95, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s67, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s28 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v18 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s10 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v17 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v23 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v16 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s12 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v21 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v19 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s25 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v41, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v29 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: s_add_i32 s47, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s93, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s42, s42, 3 -; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_add_i32 s56, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s66, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s57, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s91, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s40, s40, 3 -; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: s_add_i32 s14, s14, 3 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s14 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v48, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s40 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s41 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v49, s42 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v15, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_add_i32 s58, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s65, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s59, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s89, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s60, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s64, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s61, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s79, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s62, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s55, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s63, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s77, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s21, 0xffff +; SI-NEXT: s_lshl_b32 s6, s54, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s75, 16 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: v_readlane_b32 s7, v29, 2 +; SI-NEXT: s_add_i32 s19, s7, 3 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s85, 16 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_readlane_b32 s8, v29, 1 +; SI-NEXT: s_add_i32 s16, s8, 3 +; SI-NEXT: s_and_b32 s8, s16, 0xffff +; SI-NEXT: s_lshl_b32 s9, s73, 16 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: v_readlane_b32 s9, v29, 0 +; SI-NEXT: s_add_i32 s17, s9, 3 +; SI-NEXT: s_and_b32 s9, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s83, 16 +; SI-NEXT: s_or_b32 s9, s16, s9 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_lshr_b64 s[72:73], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[62:63], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[60:61], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[58:59], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[56:57], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[36:37], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[38:39], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[48:49], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[52:53], s[10:11], 16 +; SI-NEXT: s_lshr_b32 s83, s9, 16 +; SI-NEXT: s_lshr_b32 s85, s7, 16 +; SI-NEXT: s_lshr_b32 s54, s5, 16 +; SI-NEXT: s_lshr_b32 s55, s63, 16 +; SI-NEXT: s_lshr_b32 s64, s61, 16 +; SI-NEXT: s_lshr_b32 s65, s59, 16 +; SI-NEXT: s_lshr_b32 s66, s57, 16 +; SI-NEXT: s_lshr_b32 s67, s47, 16 +; SI-NEXT: s_lshr_b32 s68, s45, 16 +; SI-NEXT: s_lshr_b32 s69, s43, 16 +; SI-NEXT: s_lshr_b32 s70, s41, 16 +; SI-NEXT: s_lshr_b32 s71, s15, 16 +; SI-NEXT: s_lshr_b32 s80, s13, 16 +; SI-NEXT: s_lshr_b32 s81, s11, 16 ; SI-NEXT: .LBB57_3: ; %end -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v36 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v10, v39 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v49 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v51 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v46 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v54 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v57 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v55 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v61 -; SI-NEXT: v_or_b32_e32 v19, v23, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v41 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v28 -; SI-NEXT: v_or_b32_e32 v20, v23, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v63 -; SI-NEXT: v_or_b32_e32 v21, v29, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_or_b32_e32 v22, v28, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v58 -; SI-NEXT: v_or_b32_e32 v23, v30, v23 -; SI-NEXT: v_or_b32_e32 v24, v28, v24 -; SI-NEXT: v_or_b32_e32 v25, v29, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v59 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s16, s72, 16 +; SI-NEXT: s_or_b32 s8, s8, s16 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s16, s83, 16 +; SI-NEXT: s_or_b32 s9, s9, s16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s16, s74, 16 +; SI-NEXT: s_or_b32 s6, s6, s16 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s16, s85, 16 +; SI-NEXT: s_or_b32 s7, s7, s16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s16, s76, 16 +; SI-NEXT: s_or_b32 s4, s4, s16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s16, s54, 16 +; SI-NEXT: s_or_b32 s5, s5, s16 +; SI-NEXT: s_and_b32 s16, s62, 0xffff +; SI-NEXT: s_lshl_b32 s17, s78, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s63, 0xffff +; SI-NEXT: s_lshl_b32 s18, s55, 16 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_and_b32 s18, s60, 0xffff +; SI-NEXT: s_lshl_b32 s19, s88, 16 +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: s_and_b32 s19, s61, 0xffff +; SI-NEXT: s_lshl_b32 s20, s64, 16 +; SI-NEXT: s_or_b32 s19, s19, s20 +; SI-NEXT: s_and_b32 s20, s58, 0xffff +; SI-NEXT: s_lshl_b32 s21, s90, 16 +; SI-NEXT: s_or_b32 s20, s20, s21 +; SI-NEXT: s_and_b32 s21, s59, 0xffff +; SI-NEXT: s_lshl_b32 s22, s65, 16 +; SI-NEXT: s_or_b32 s21, s21, s22 +; SI-NEXT: s_and_b32 s22, s56, 0xffff +; SI-NEXT: s_lshl_b32 s23, s92, 16 +; SI-NEXT: s_or_b32 s22, s22, s23 +; SI-NEXT: s_and_b32 s23, s57, 0xffff +; SI-NEXT: s_lshl_b32 s24, s66, 16 +; SI-NEXT: s_or_b32 s23, s23, s24 +; SI-NEXT: s_and_b32 s24, s46, 0xffff +; SI-NEXT: s_lshl_b32 s25, s94, 16 +; SI-NEXT: s_or_b32 s24, s24, s25 +; SI-NEXT: s_and_b32 s25, s47, 0xffff +; SI-NEXT: s_lshl_b32 s26, s67, 16 +; SI-NEXT: s_or_b32 s25, s25, s26 +; SI-NEXT: s_and_b32 s26, s44, 0xffff +; SI-NEXT: s_lshl_b32 s27, s30, 16 +; SI-NEXT: s_or_b32 s26, s26, s27 +; SI-NEXT: s_and_b32 s27, s45, 0xffff +; SI-NEXT: s_lshl_b32 s28, s68, 16 +; SI-NEXT: s_or_b32 s27, s27, s28 +; SI-NEXT: s_and_b32 s28, s42, 0xffff +; SI-NEXT: s_lshl_b32 s29, s34, 16 +; SI-NEXT: s_or_b32 s28, s28, s29 +; SI-NEXT: s_and_b32 s29, s43, 0xffff +; SI-NEXT: s_lshl_b32 s42, s69, 16 +; SI-NEXT: s_or_b32 s29, s29, s42 +; SI-NEXT: s_and_b32 s40, s40, 0xffff +; SI-NEXT: s_lshl_b32 s42, s36, 16 +; SI-NEXT: s_or_b32 s40, s40, s42 +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: s_lshl_b32 s42, s70, 16 +; SI-NEXT: s_or_b32 s41, s41, s42 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s42, s38, 16 +; SI-NEXT: s_or_b32 s14, s14, s42 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s42, s71, 16 +; SI-NEXT: s_or_b32 s15, s15, s42 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s42, s48, 16 +; SI-NEXT: s_or_b32 s12, s12, s42 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s42, s80, 16 +; SI-NEXT: s_or_b32 s13, s13, s42 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s42, s52, 16 +; SI-NEXT: s_or_b32 s10, s10, s42 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s42, s81, 16 +; SI-NEXT: s_or_b32 s11, s11, s42 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_mov_b32_e32 v5, s5 +; SI-NEXT: v_mov_b32_e32 v6, s16 +; SI-NEXT: v_mov_b32_e32 v7, s17 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v9, s19 +; SI-NEXT: v_mov_b32_e32 v10, s20 +; SI-NEXT: v_mov_b32_e32 v11, s21 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v13, s23 +; SI-NEXT: v_mov_b32_e32 v14, s24 +; SI-NEXT: v_mov_b32_e32 v15, s25 +; SI-NEXT: v_mov_b32_e32 v16, s26 +; SI-NEXT: v_mov_b32_e32 v17, s27 +; SI-NEXT: v_mov_b32_e32 v18, s28 +; SI-NEXT: v_mov_b32_e32 v19, s29 +; SI-NEXT: v_mov_b32_e32 v20, s40 +; SI-NEXT: v_mov_b32_e32 v21, s41 +; SI-NEXT: v_mov_b32_e32 v22, s14 +; SI-NEXT: v_mov_b32_e32 v23, s15 +; SI-NEXT: v_mov_b32_e32 v24, s12 +; SI-NEXT: v_mov_b32_e32 v25, s13 +; SI-NEXT: v_mov_b32_e32 v26, s10 +; SI-NEXT: v_mov_b32_e32 v27, s11 +; SI-NEXT: v_readlane_b32 s99, v28, 35 +; SI-NEXT: v_readlane_b32 s98, v28, 34 +; SI-NEXT: v_readlane_b32 s97, v28, 33 +; SI-NEXT: v_readlane_b32 s96, v28, 32 +; SI-NEXT: v_readlane_b32 s87, v28, 31 +; SI-NEXT: v_readlane_b32 s86, v28, 30 +; SI-NEXT: v_readlane_b32 s85, v28, 29 +; SI-NEXT: v_readlane_b32 s84, v28, 28 +; SI-NEXT: v_readlane_b32 s83, v28, 27 +; SI-NEXT: v_readlane_b32 s82, v28, 26 +; SI-NEXT: v_readlane_b32 s81, v28, 25 +; SI-NEXT: v_readlane_b32 s80, v28, 24 +; SI-NEXT: v_readlane_b32 s71, v28, 23 +; SI-NEXT: v_readlane_b32 s70, v28, 22 +; SI-NEXT: v_readlane_b32 s69, v28, 21 +; SI-NEXT: v_readlane_b32 s68, v28, 20 +; SI-NEXT: v_readlane_b32 s67, v28, 19 +; SI-NEXT: v_readlane_b32 s66, v28, 18 +; SI-NEXT: v_readlane_b32 s65, v28, 17 +; SI-NEXT: v_readlane_b32 s64, v28, 16 +; SI-NEXT: v_readlane_b32 s55, v28, 15 +; SI-NEXT: v_readlane_b32 s54, v28, 14 +; SI-NEXT: v_readlane_b32 s53, v28, 13 +; SI-NEXT: v_readlane_b32 s52, v28, 12 +; SI-NEXT: v_readlane_b32 s51, v28, 11 +; SI-NEXT: v_readlane_b32 s50, v28, 10 +; SI-NEXT: v_readlane_b32 s49, v28, 9 +; SI-NEXT: v_readlane_b32 s48, v28, 8 +; SI-NEXT: v_readlane_b32 s39, v28, 7 +; SI-NEXT: v_readlane_b32 s38, v28, 6 +; SI-NEXT: v_readlane_b32 s37, v28, 5 +; SI-NEXT: v_readlane_b32 s36, v28, 4 +; SI-NEXT: v_readlane_b32 s35, v28, 3 +; SI-NEXT: v_readlane_b32 s34, v28, 2 +; SI-NEXT: v_readlane_b32 s31, v28, 1 +; SI-NEXT: v_readlane_b32 s30, v28, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; kill: killed $vgpr15 -; SI-NEXT: v_mov_b32_e32 v35, v45 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; kill: killed $vgpr15 -; SI-NEXT: v_mov_b32_e32 v34, v43 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: v_mov_b32_e32 v33, v41 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: v_mov_b32_e32 v32, v23 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: v_mov_b32_e32 v31, v20 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: v_mov_b32_e32 v30, v19 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: v_mov_b32_e32 v29, v18 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: v_mov_b32_e32 v28, v17 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: v_mov_b32_e32 v19, v16 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: v_mov_b32_e32 v18, v27 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: v_mov_b32_e32 v17, v26 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: v_mov_b32_e32 v23, v25 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: v_mov_b32_e32 v16, v24 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: v_mov_b32_e32 v21, v22 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr52 ; SI-NEXT: s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v56i16_to_v56f16_scalar: @@ -45611,10 +42361,6 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v56f16_to_v56i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v4 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill @@ -45628,145 +42374,37 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v34 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v37 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v49 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v34 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v37 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v46, v49 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v57, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v34 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v36 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v37 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v39 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v49 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v58, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v57 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -45811,258 +42449,260 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v54 ; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v52 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_or_b32_e32 v27, v27, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v29, v29, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_or_b32_e32 v25, v25, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v30, v30, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_or_b32_e32 v23, v23, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v24, v24, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_or_b32_e32 v21, v21, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v31, v31, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_or_b32_e32 v19, v19, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_or_b32_e32 v33, v33, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_or_b32_e32 v18, v18, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_or_b32_e32 v17, v17, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_or_b32_e32 v15, v15, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_or_b32_e32 v34, v34, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; SI-NEXT: v_or_b32_e32 v13, v13, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_or_b32_e32 v36, v36, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; SI-NEXT: v_or_b32_e32 v12, v12, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v9 +; SI-NEXT: v_or_b32_e32 v11, v11, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_or_b32_e32 v37, v37, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v9, v9, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_or_b32_e32 v39, v39, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v5 -; SI-NEXT: v_or_b32_e32 v6, v6, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v3 -; SI-NEXT: v_or_b32_e32 v50, v50, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_or_b32_e32 v7, v7, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v48 +; SI-NEXT: v_or_b32_e32 v5, v5, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v49 +; SI-NEXT: v_or_b32_e32 v3, v3, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v50 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v56 +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v47 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v44 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v41 ; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; SI-NEXT: v_or_b32_e32 v2, v2, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v54 ; SI-NEXT: v_or_b32_e32 v0, v0, v28 -; SI-NEXT: v_or_b32_e32 v49, v49, v46 +; SI-NEXT: v_or_b32_e32 v2, v2, v46 ; SI-NEXT: v_or_b32_e32 v4, v4, v45 -; SI-NEXT: v_or_b32_e32 v48, v48, v57 +; SI-NEXT: v_or_b32_e32 v6, v6, v57 ; SI-NEXT: v_or_b32_e32 v8, v8, v43 ; SI-NEXT: v_or_b32_e32 v10, v10, v42 -; SI-NEXT: v_or_b32_e32 v38, v38, v58 +; SI-NEXT: v_or_b32_e32 v12, v12, v58 ; SI-NEXT: v_or_b32_e32 v14, v14, v40 ; SI-NEXT: v_or_b32_e32 v16, v16, v55 -; SI-NEXT: v_or_b32_e32 v35, v35, v59 +; SI-NEXT: v_or_b32_e32 v18, v18, v59 ; SI-NEXT: v_or_b32_e32 v20, v20, v53 ; SI-NEXT: v_or_b32_e32 v22, v22, v52 -; SI-NEXT: v_or_b32_e32 v32, v32, v60 +; SI-NEXT: v_or_b32_e32 v24, v24, v60 ; SI-NEXT: v_or_b32_e32 v26, v26, v51 -; SI-NEXT: v_alignbit_b32 v56, v2, v28, 16 -; SI-NEXT: v_alignbit_b32 v47, v50, v46, 16 -; SI-NEXT: v_alignbit_b32 v46, v6, v45, 16 -; SI-NEXT: v_alignbit_b32 v45, v39, v57, 16 -; SI-NEXT: v_alignbit_b32 v44, v37, v43, 16 -; SI-NEXT: v_alignbit_b32 v43, v12, v42, 16 -; SI-NEXT: v_alignbit_b32 v42, v36, v58, 16 -; SI-NEXT: v_alignbit_b32 v41, v34, v40, 16 -; SI-NEXT: v_alignbit_b32 v40, v18, v55, 16 -; SI-NEXT: v_alignbit_b32 v55, v33, v59, 16 -; SI-NEXT: v_alignbit_b32 v54, v31, v53, 16 -; SI-NEXT: v_alignbit_b32 v53, v24, v52, 16 -; SI-NEXT: v_alignbit_b32 v52, v30, v60, 16 -; SI-NEXT: v_alignbit_b32 v51, v29, v51, 16 +; SI-NEXT: v_alignbit_b32 v56, v1, v28, 16 +; SI-NEXT: v_alignbit_b32 v47, v3, v46, 16 +; SI-NEXT: v_alignbit_b32 v46, v5, v45, 16 +; SI-NEXT: v_alignbit_b32 v45, v7, v57, 16 +; SI-NEXT: v_alignbit_b32 v44, v9, v43, 16 +; SI-NEXT: v_alignbit_b32 v43, v11, v42, 16 +; SI-NEXT: v_alignbit_b32 v42, v13, v58, 16 +; SI-NEXT: v_alignbit_b32 v41, v15, v40, 16 +; SI-NEXT: v_alignbit_b32 v40, v17, v55, 16 +; SI-NEXT: v_alignbit_b32 v55, v19, v59, 16 +; SI-NEXT: v_alignbit_b32 v54, v21, v53, 16 +; SI-NEXT: v_alignbit_b32 v53, v23, v52, 16 +; SI-NEXT: v_alignbit_b32 v52, v25, v60, 16 +; SI-NEXT: v_alignbit_b32 v51, v27, v51, 16 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v56 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v28 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v28 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v47 ; SI-NEXT: v_or_b32_e32 v2, v2, v28 -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v50 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v28, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v49 +; SI-NEXT: v_or_b32_e32 v3, v3, v28 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v46 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v4, v4, v28 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v48 +; SI-NEXT: v_or_b32_e32 v5, v5, v28 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v45 ; SI-NEXT: v_or_b32_e32 v6, v6, v28 -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v39 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v28, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 +; SI-NEXT: v_or_b32_e32 v7, v7, v28 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v44 ; SI-NEXT: v_or_b32_e32 v8, v8, v28 -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v37 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v28, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v38 +; SI-NEXT: v_or_b32_e32 v9, v9, v28 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v43 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v10, v10, v28 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v37 +; SI-NEXT: v_or_b32_e32 v11, v11, v28 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v42 ; SI-NEXT: v_or_b32_e32 v12, v12, v28 -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v36 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v28, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v36 +; SI-NEXT: v_or_b32_e32 v13, v13, v28 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v41 ; SI-NEXT: v_or_b32_e32 v14, v14, v28 -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v34 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v28, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v35 +; SI-NEXT: v_or_b32_e32 v15, v15, v28 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v40 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload @@ -46079,40 +42719,40 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v16, v16, v28 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v34 +; SI-NEXT: v_or_b32_e32 v17, v17, v28 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v55 ; SI-NEXT: v_or_b32_e32 v18, v18, v28 -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v33 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v28, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v33 +; SI-NEXT: v_or_b32_e32 v19, v19, v28 ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v54 ; SI-NEXT: v_or_b32_e32 v20, v20, v28 -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v31 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v28, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v32 +; SI-NEXT: v_or_b32_e32 v21, v21, v28 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v53 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v22, v22, v28 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v32 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v31 +; SI-NEXT: v_or_b32_e32 v23, v23, v28 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v52 ; SI-NEXT: v_or_b32_e32 v24, v24, v28 -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v30 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v28, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v30 +; SI-NEXT: v_or_b32_e32 v25, v25, v28 ; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v51 ; SI-NEXT: v_or_b32_e32 v26, v26, v28 -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -46679,15 +43319,28 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i ; SI-LABEL: bitcast_v56f16_to_v56i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s16 -; SI-NEXT: s_lshr_b32 s42, s17, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v25, v10 +; SI-NEXT: v_mov_b32_e32 v30, v8 +; SI-NEXT: v_mov_b32_e32 v31, v6 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v19, v2 +; SI-NEXT: v_mov_b32_e32 v20, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_lshr_b32 s12, s29, 16 +; SI-NEXT: s_lshr_b32 s13, s28, 16 +; SI-NEXT: s_lshr_b32 s11, s27, 16 +; SI-NEXT: s_lshr_b32 s14, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s15, s24, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 16 +; SI-NEXT: s_lshr_b32 s40, s22, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s41, s20, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s42, s18, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -46704,521 +43357,412 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: s_lshr_b32 s10, s25, 16 -; SI-NEXT: s_lshr_b32 s40, s19, 16 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, s14 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s10 -; SI-NEXT: s_lshr_b32 s12, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v59, v5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v20 +; SI-NEXT: s_cbranch_scc0 .LBB59_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB59_4 +; SI-NEXT: .LBB59_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s18 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_lshr_b32 s6, s29, 16 -; SI-NEXT: s_lshr_b32 s8, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v14 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v4, v16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SI-NEXT: s_lshr_b32 s7, s28, 16 -; SI-NEXT: s_lshr_b32 s9, s26, 16 -; SI-NEXT: s_lshr_b32 s11, s24, 16 -; SI-NEXT: s_lshr_b32 s13, s22, 16 -; SI-NEXT: s_lshr_b32 s15, s20, 16 -; SI-NEXT: s_lshr_b32 s41, s18, 16 -; SI-NEXT: s_lshr_b32 s43, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v63, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s20 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v33 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v47, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v15 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v24 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cbranch_scc0 .LBB59_4 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB59_3 -; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v44 -; SI-NEXT: v_mov_b32_e32 v31, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s15 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, s14 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v4, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v20 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s13 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_or_b32_e32 v52, v0, v37 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s28 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v43 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v34 -; SI-NEXT: v_or_b32_e32 v34, v26, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v52 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v54 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_or_b32_e32 v20, v6, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_or_b32_e32 v41, v4, v53 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 +; SI-NEXT: v_or_b32_e32 v19, v8, v2 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v31 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_or_b32_e32 v18, v8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v31, v10, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v30 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v48 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v25 +; SI-NEXT: v_or_b32_e32 v30, v10, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v49 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v32 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v26, v15, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v23, v17, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v15 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v26, v29, v2 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v39, v29, v4 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v17 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v25 +; SI-NEXT: v_or_b32_e32 v13, v13, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v24 +; SI-NEXT: v_or_b32_e32 v11, v11, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v46 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v39 +; SI-NEXT: v_or_b32_e32 v9, v9, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v15 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v17 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v50 +; SI-NEXT: v_or_b32_e32 v7, v7, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v49 +; SI-NEXT: v_or_b32_e32 v5, v5, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v58 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v48 +; SI-NEXT: v_or_b32_e32 v3, v3, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_or_b32_e32 v29, v29, v10 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_or_b32_e32 v26, v26, v6 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v26, v30, v8 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v61, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v61 +; SI-NEXT: v_or_b32_e32 v54, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s10 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v62 +; SI-NEXT: v_or_b32_e32 v38, v21, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s9 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v63 +; SI-NEXT: v_or_b32_e32 v36, v15, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s8 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v59 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v22 +; SI-NEXT: v_or_b32_e32 v28, v21, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s7 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v60 +; SI-NEXT: v_lshr_b64 v[45:46], v[27:28], 16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_or_b32_e32 v34, v17, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v32 +; SI-NEXT: v_or_b32_e32 v17, v17, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: v_or_b32_e32 v15, v15, v21 +; SI-NEXT: v_lshr_b64 v[57:58], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[55:56], v[16:17], 16 +; SI-NEXT: v_mov_b32_e32 v16, v41 +; SI-NEXT: v_lshr_b64 v[41:42], v[37:38], 16 +; SI-NEXT: v_mov_b32_e32 v58, v51 +; SI-NEXT: v_mov_b32_e32 v56, v48 +; SI-NEXT: v_lshr_b64 v[47:48], v[33:34], 16 +; SI-NEXT: v_mov_b32_e32 v46, v50 +; SI-NEXT: v_lshr_b64 v[43:44], v[35:36], 16 +; SI-NEXT: v_mov_b32_e32 v42, v39 +; SI-NEXT: v_lshr_b64 v[39:40], v[53:54], 16 +; SI-NEXT: v_lshr_b64 v[50:51], v[2:3], 16 +; SI-NEXT: v_mov_b32_e32 v44, v49 +; SI-NEXT: v_mov_b32_e32 v40, v25 +; SI-NEXT: v_mov_b32_e32 v51, v24 +; SI-NEXT: v_lshr_b64 v[48:49], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[10:11], 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v49, v23 +; SI-NEXT: v_lshr_b64 v[22:23], v[8:9], 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v25, v26 +; SI-NEXT: v_lshr_b64 v[26:27], v[12:13], 16 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v14, v52 +; SI-NEXT: v_lshr_b64 v[52:53], v[0:1], 16 +; SI-NEXT: s_branch .LBB59_5 +; SI-NEXT: .LBB59_3: +; SI-NEXT: s_branch .LBB59_2 +; SI-NEXT: .LBB59_4: +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v61, s12 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v62, s11 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v63, s10 +; SI-NEXT: v_mov_b32_e32 v59, s9 +; SI-NEXT: v_mov_b32_e32 v60, s8 +; SI-NEXT: v_mov_b32_e32 v32, s7 +; SI-NEXT: v_mov_b32_e32 v29, s6 +; SI-NEXT: v_mov_b32_e32 v15, s17 +; SI-NEXT: v_mov_b32_e32 v17, s19 +; SI-NEXT: v_mov_b32_e32 v34, s21 +; SI-NEXT: v_mov_b32_e32 v28, s23 +; SI-NEXT: v_mov_b32_e32 v36, s25 +; SI-NEXT: v_mov_b32_e32 v38, s27 +; SI-NEXT: v_mov_b32_e32 v54, s29 +; SI-NEXT: v_mov_b32_e32 v16, s28 +; SI-NEXT: v_mov_b32_e32 v14, s26 +; SI-NEXT: v_mov_b32_e32 v12, s24 +; SI-NEXT: v_mov_b32_e32 v10, s22 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v6, s18 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v57, s43 +; SI-NEXT: v_mov_b32_e32 v55, s42 +; SI-NEXT: v_mov_b32_e32 v47, s41 +; SI-NEXT: v_mov_b32_e32 v45, s40 +; SI-NEXT: v_mov_b32_e32 v43, s15 +; SI-NEXT: v_mov_b32_e32 v41, s14 +; SI-NEXT: v_mov_b32_e32 v39, s13 +; SI-NEXT: .LBB59_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v57 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_or_b32_e32 v29, v29, v16 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v31 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_or_b32_e32 v26, v26, v12 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v26, v30, v14 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v53 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_or_b32_e32 v48, v26, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v55 -; SI-NEXT: v_or_b32_e32 v36, v30, v20 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_or_b32_e32 v38, v26, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v28 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_or_b32_e32 v28, v28, v24 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v51 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v31, v29, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v33 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v28 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v35 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v29 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v30 -; SI-NEXT: v_or_b32_e32 v25, v25, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v56 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v32 -; SI-NEXT: v_or_b32_e32 v23, v23, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v49 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v28 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v56 -; SI-NEXT: v_or_b32_e32 v21, v21, v28 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v29 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v33 -; SI-NEXT: v_or_b32_e32 v19, v19, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v62 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v37 -; SI-NEXT: v_or_b32_e32 v17, v17, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v63 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v28 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v62 -; SI-NEXT: v_or_b32_e32 v15, v15, v28 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v29 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v63 -; SI-NEXT: v_or_b32_e32 v13, v13, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v57 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v61 -; SI-NEXT: v_or_b32_e32 v11, v11, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v59 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v28 -; SI-NEXT: v_lshr_b64 v[52:53], v[10:11], 16 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v60 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v57 -; SI-NEXT: v_or_b32_e32 v9, v9, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v59 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v29 -; SI-NEXT: v_or_b32_e32 v7, v7, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v41 -; SI-NEXT: v_lshr_b64 v[40:41], v[6:7], 16 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v60 -; SI-NEXT: v_or_b32_e32 v5, v5, v29 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v58 -; SI-NEXT: v_lshr_b64 v[42:43], v[4:5], 16 -; SI-NEXT: v_lshr_b64 v[54:55], v[8:9], 16 -; SI-NEXT: v_mov_b32_e32 v41, v29 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: v_or_b32_e32 v3, v3, v28 -; SI-NEXT: v_lshr_b64 v[44:45], v[2:3], 16 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v58 -; SI-NEXT: v_or_b32_e32 v1, v1, v28 -; SI-NEXT: v_lshr_b64 v[28:29], v[26:27], 16 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_lshr_b64 v[46:47], v[0:1], 16 -; SI-NEXT: v_mov_b32_e32 v47, v48 -; SI-NEXT: v_mov_b32_e32 v45, v37 -; SI-NEXT: v_mov_b32_e32 v43, v39 -; SI-NEXT: v_mov_b32_e32 v55, v38 -; SI-NEXT: v_mov_b32_e32 v53, v36 -; SI-NEXT: v_lshr_b64 v[50:51], v[12:13], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[14:15], 16 -; SI-NEXT: v_lshr_b64 v[38:39], v[16:17], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[18:19], 16 -; SI-NEXT: v_mov_b32_e32 v51, v35 -; SI-NEXT: v_mov_b32_e32 v49, v33 -; SI-NEXT: v_mov_b32_e32 v39, v34 -; SI-NEXT: v_mov_b32_e32 v37, v32 -; SI-NEXT: v_lshr_b64 v[34:35], v[20:21], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[22:23], 16 -; SI-NEXT: v_mov_b32_e32 v35, v31 -; SI-NEXT: v_mov_b32_e32 v33, v30 -; SI-NEXT: v_lshr_b64 v[30:31], v[24:25], 16 -; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v46 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v43 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v48 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v38 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v36 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v34 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v32 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v29 +; SI-NEXT: v_or_b32_e32 v33, v2, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v6 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v41 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v32 +; SI-NEXT: v_or_b32_e32 v37, v4, v6 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v47 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v8 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v60 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v40 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v60 +; SI-NEXT: v_or_b32_e32 v35, v6, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v45 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v10 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v59 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v54 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v59 +; SI-NEXT: v_or_b32_e32 v28, v8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v43 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v12 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v57 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v52 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_or_b32_e32 v34, v10, v12 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v41 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v14 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v61 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v62 +; SI-NEXT: v_or_b32_e32 v36, v12, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 ; SI-NEXT: v_or_b32_e32 v12, v12, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v63 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_or_b32_e32 v14, v14, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v62 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_or_b32_e32 v16, v16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v45 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v47 -; SI-NEXT: v_or_b32_e32 v18, v18, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v49 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v53 -; SI-NEXT: v_or_b32_e32 v20, v20, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v56 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v61 +; SI-NEXT: v_or_b32_e32 v38, v14, v15 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v52 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v58 +; SI-NEXT: v_or_b32_e32 v15, v1, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v50 +; SI-NEXT: v_or_b32_e32 v16, v1, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 +; SI-NEXT: v_or_b32_e32 v17, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 +; SI-NEXT: v_or_b32_e32 v18, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; SI-NEXT: v_or_b32_e32 v19, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; SI-NEXT: v_or_b32_e32 v20, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v46 +; SI-NEXT: v_or_b32_e32 v21, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v42 +; SI-NEXT: v_or_b32_e32 v23, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v51 +; SI-NEXT: v_or_b32_e32 v25, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v1, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -47235,24 +43779,17 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v22, v22, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v37 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v30 -; SI-NEXT: v_or_b32_e32 v24, v24, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v33 -; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v35 -; SI-NEXT: v_or_b32_e32 v26, v26, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v51 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v27, v1, v3 +; SI-NEXT: v_mov_b32_e32 v1, v33 +; SI-NEXT: v_mov_b32_e32 v3, v37 +; SI-NEXT: v_mov_b32_e32 v5, v35 +; SI-NEXT: v_mov_b32_e32 v7, v28 +; SI-NEXT: v_mov_b32_e32 v9, v34 +; SI-NEXT: v_mov_b32_e32 v11, v36 +; SI-NEXT: v_mov_b32_e32 v13, v38 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB59_4: -; SI-NEXT: s_branch .LBB59_2 ; ; VI-LABEL: bitcast_v56f16_to_v56i16_scalar: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll index c4d17c79d773e..4fe874215b3f8 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll @@ -7374,617 +7374,265 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v22 +; SI-NEXT: v_alignbit_b32 v30, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v31, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v32, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v33, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v34, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v35, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v36, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v37, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v38, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v39, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v49, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v52, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v54, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v41, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v43, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v11 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v62, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v27 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v25 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v28 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v40, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v1 ; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v18 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 -; SI-NEXT: v_mov_b32_e32 v62, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_alignbit_b32 v30, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v31, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v32, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v33, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v34, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v35, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v36, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v37, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v38, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v39, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v49, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v52, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v54, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v41, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v43, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 -; SI-NEXT: v_mov_b32_e32 v59, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_mov_b32_e32 v34, v27 -; SI-NEXT: v_mov_b32_e32 v32, v29 -; SI-NEXT: v_mov_b32_e32 v30, v28 -; SI-NEXT: v_mov_b32_e32 v60, v25 -; SI-NEXT: v_mov_b32_e32 v57, v26 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v1 ; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; SI-NEXT: v_or_b32_e32 v0, v0, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v59 +; SI-NEXT: v_or_b32_e32 v2, v2, v41 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v58 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v43 +; SI-NEXT: v_or_b32_e32 v3, v3, v41 +; SI-NEXT: v_or_b32_e32 v4, v4, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v57 +; SI-NEXT: v_or_b32_e32 v6, v6, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v56 +; SI-NEXT: v_or_b32_e32 v8, v8, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v47 +; SI-NEXT: v_or_b32_e32 v10, v10, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v46 +; SI-NEXT: v_or_b32_e32 v12, v12, v38 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v45 +; SI-NEXT: v_or_b32_e32 v14, v14, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v44 +; SI-NEXT: v_or_b32_e32 v16, v16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v42 +; SI-NEXT: v_or_b32_e32 v18, v18, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v40 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v40 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v4, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v36 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v32 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v6, v50 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v37 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v10, v33 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v63 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v58 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v59 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v24, v38 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v57 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v49 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v53 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v55 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 -; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v20, v20, v34 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v55 +; SI-NEXT: v_or_b32_e32 v22, v22, v33 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v53 +; SI-NEXT: v_or_b32_e32 v24, v24, v32 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v51 +; SI-NEXT: v_or_b32_e32 v26, v26, v31 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v50 +; SI-NEXT: v_or_b32_e32 v28, v28, v30 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v48 +; SI-NEXT: v_or_b32_e32 v5, v5, v54 +; SI-NEXT: v_or_b32_e32 v7, v7, v52 +; SI-NEXT: v_or_b32_e32 v9, v9, v49 +; SI-NEXT: v_or_b32_e32 v11, v11, v39 +; SI-NEXT: v_or_b32_e32 v13, v13, v38 +; SI-NEXT: v_or_b32_e32 v15, v15, v37 +; SI-NEXT: v_or_b32_e32 v17, v17, v36 +; SI-NEXT: v_or_b32_e32 v19, v19, v35 +; SI-NEXT: v_or_b32_e32 v21, v21, v34 +; SI-NEXT: v_or_b32_e32 v23, v23, v33 +; SI-NEXT: v_or_b32_e32 v25, v25, v32 +; SI-NEXT: v_or_b32_e32 v27, v27, v31 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -8633,174 +8281,117 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; SI-LABEL: bitcast_v30i32_to_v60f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v30, s30, 0 +; SI-NEXT: v_writelane_b32 v30, s31, 1 +; SI-NEXT: v_writelane_b32 v30, s34, 2 +; SI-NEXT: v_writelane_b32 v30, s35, 3 +; SI-NEXT: v_writelane_b32 v30, s36, 4 +; SI-NEXT: v_writelane_b32 v30, s37, 5 +; SI-NEXT: v_writelane_b32 v30, s38, 6 +; SI-NEXT: v_writelane_b32 v30, s39, 7 +; SI-NEXT: v_writelane_b32 v30, s48, 8 +; SI-NEXT: v_writelane_b32 v30, s49, 9 +; SI-NEXT: v_writelane_b32 v30, s50, 10 ; SI-NEXT: v_mov_b32_e32 v17, s16 ; SI-NEXT: v_mov_b32_e32 v18, s17 +; SI-NEXT: v_writelane_b32 v30, s51, 11 ; SI-NEXT: v_mov_b32_e32 v19, s18 -; SI-NEXT: v_readfirstlane_b32 s40, v17 +; SI-NEXT: v_readfirstlane_b32 s46, v17 ; SI-NEXT: v_mov_b32_e32 v17, s19 -; SI-NEXT: v_readfirstlane_b32 s41, v18 +; SI-NEXT: v_readfirstlane_b32 s47, v18 ; SI-NEXT: v_mov_b32_e32 v18, s20 -; SI-NEXT: v_readfirstlane_b32 s42, v19 +; SI-NEXT: v_writelane_b32 v30, s52, 12 +; SI-NEXT: v_readfirstlane_b32 s44, v19 ; SI-NEXT: v_mov_b32_e32 v19, s21 -; SI-NEXT: v_readfirstlane_b32 s43, v17 +; SI-NEXT: v_readfirstlane_b32 s45, v17 ; SI-NEXT: v_mov_b32_e32 v17, s22 -; SI-NEXT: v_readfirstlane_b32 s44, v18 +; SI-NEXT: v_readfirstlane_b32 s42, v18 ; SI-NEXT: v_mov_b32_e32 v18, s23 -; SI-NEXT: v_readfirstlane_b32 s45, v19 +; SI-NEXT: v_writelane_b32 v30, s53, 13 +; SI-NEXT: v_readfirstlane_b32 s43, v19 ; SI-NEXT: v_mov_b32_e32 v19, s24 -; SI-NEXT: v_readfirstlane_b32 s24, v17 +; SI-NEXT: v_readfirstlane_b32 s40, v17 ; SI-NEXT: v_mov_b32_e32 v17, s25 -; SI-NEXT: v_readfirstlane_b32 s25, v18 +; SI-NEXT: v_readfirstlane_b32 s41, v18 ; SI-NEXT: v_mov_b32_e32 v18, s26 -; SI-NEXT: v_readfirstlane_b32 s26, v19 +; SI-NEXT: v_writelane_b32 v30, s54, 14 +; SI-NEXT: v_readfirstlane_b32 s24, v19 ; SI-NEXT: v_mov_b32_e32 v19, s27 -; SI-NEXT: v_readfirstlane_b32 s27, v17 +; SI-NEXT: v_readfirstlane_b32 s25, v17 ; SI-NEXT: v_mov_b32_e32 v17, s28 -; SI-NEXT: v_readfirstlane_b32 s28, v18 +; SI-NEXT: v_readfirstlane_b32 s22, v18 ; SI-NEXT: v_mov_b32_e32 v18, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: v_readfirstlane_b32 s29, v19 -; SI-NEXT: v_readfirstlane_b32 s23, v17 -; SI-NEXT: v_readfirstlane_b32 s22, v18 -; SI-NEXT: v_readfirstlane_b32 s21, v0 -; SI-NEXT: v_readfirstlane_b32 s20, v1 -; SI-NEXT: v_readfirstlane_b32 s19, v2 -; SI-NEXT: v_readfirstlane_b32 s18, v3 -; SI-NEXT: v_readfirstlane_b32 s17, v4 -; SI-NEXT: v_readfirstlane_b32 s16, v5 -; SI-NEXT: v_readfirstlane_b32 s15, v6 -; SI-NEXT: v_readfirstlane_b32 s14, v7 -; SI-NEXT: v_readfirstlane_b32 s13, v8 -; SI-NEXT: v_readfirstlane_b32 s12, v9 -; SI-NEXT: v_readfirstlane_b32 s11, v10 -; SI-NEXT: v_readfirstlane_b32 s10, v11 -; SI-NEXT: v_readfirstlane_b32 s8, v12 +; SI-NEXT: v_writelane_b32 v30, s55, 15 +; SI-NEXT: v_readfirstlane_b32 s23, v19 +; SI-NEXT: v_readfirstlane_b32 s20, v17 +; SI-NEXT: v_readfirstlane_b32 s21, v18 +; SI-NEXT: v_readfirstlane_b32 s18, v0 +; SI-NEXT: v_readfirstlane_b32 s19, v1 +; SI-NEXT: v_readfirstlane_b32 s16, v2 +; SI-NEXT: v_readfirstlane_b32 s17, v3 +; SI-NEXT: v_readfirstlane_b32 s14, v4 +; SI-NEXT: v_readfirstlane_b32 s15, v5 +; SI-NEXT: v_readfirstlane_b32 s12, v6 +; SI-NEXT: v_readfirstlane_b32 s13, v7 +; SI-NEXT: v_readfirstlane_b32 s10, v8 +; SI-NEXT: v_readfirstlane_b32 s11, v9 +; SI-NEXT: v_readfirstlane_b32 s8, v10 +; SI-NEXT: v_readfirstlane_b32 s9, v11 +; SI-NEXT: v_readfirstlane_b32 s6, v12 ; SI-NEXT: v_readfirstlane_b32 s7, v13 -; SI-NEXT: v_readfirstlane_b32 s6, v14 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v15 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s4, v14 +; SI-NEXT: s_and_b64 s[26:27], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v15 +; SI-NEXT: v_writelane_b32 v30, s64, 16 ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 -; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 -; SI-NEXT: s_lshr_b32 s4, s45, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s44, 16 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 -; SI-NEXT: s_lshr_b32 s4, s43, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s42, 16 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v57, s4 -; SI-NEXT: s_lshr_b32 s4, s41, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s40, 16 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s40 +; SI-NEXT: s_lshr_b32 s34, s5, 16 +; SI-NEXT: s_lshr_b32 s35, s7, 16 +; SI-NEXT: s_lshr_b32 s36, s9, 16 +; SI-NEXT: s_lshr_b32 s37, s11, 16 +; SI-NEXT: s_lshr_b32 s38, s13, 16 +; SI-NEXT: s_lshr_b32 s39, s15, 16 +; SI-NEXT: s_lshr_b32 s48, s17, 16 +; SI-NEXT: s_lshr_b32 s49, s19, 16 +; SI-NEXT: s_lshr_b32 s50, s21, 16 +; SI-NEXT: s_lshr_b32 s51, s23, 16 +; SI-NEXT: s_lshr_b32 s52, s25, 16 +; SI-NEXT: s_lshr_b32 s53, s41, 16 +; SI-NEXT: s_lshr_b32 s54, s43, 16 +; SI-NEXT: s_lshr_b32 s55, s45, 16 +; SI-NEXT: s_lshr_b32 s64, s47, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[46:47], 16 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: s_add_i32 s40, s40, 3 -; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_add_i32 s42, s42, 3 -; SI-NEXT: s_add_i32 s43, s43, 3 -; SI-NEXT: s_add_i32 s44, s44, 3 +; SI-NEXT: s_add_i32 s47, s47, 3 +; SI-NEXT: s_add_i32 s46, s46, 3 ; SI-NEXT: s_add_i32 s45, s45, 3 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s44, s44, 3 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 @@ -8815,303 +8406,216 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_lshr_b32 s4, s40, 16 -; SI-NEXT: s_lshr_b32 s5, s41, 16 -; SI-NEXT: s_lshr_b32 s46, s42, 16 -; SI-NEXT: s_lshr_b32 s47, s43, 16 -; SI-NEXT: s_lshr_b32 s56, s44, 16 -; SI-NEXT: s_lshr_b32 s57, s45, 16 -; SI-NEXT: s_lshr_b32 s58, s24, 16 -; SI-NEXT: s_lshr_b32 s59, s25, 16 -; SI-NEXT: s_lshr_b32 s60, s26, 16 -; SI-NEXT: s_lshr_b32 s61, s27, 16 -; SI-NEXT: s_lshr_b32 s62, s28, 16 -; SI-NEXT: s_lshr_b32 s63, s29, 16 -; SI-NEXT: s_lshr_b32 s72, s23, 16 -; SI-NEXT: s_lshr_b32 s73, s22, 16 -; SI-NEXT: s_lshr_b32 s74, s21, 16 -; SI-NEXT: s_lshr_b32 s75, s20, 16 -; SI-NEXT: s_lshr_b32 s76, s19, 16 -; SI-NEXT: s_lshr_b32 s77, s18, 16 -; SI-NEXT: s_lshr_b32 s78, s17, 16 -; SI-NEXT: s_lshr_b32 s79, s16, 16 -; SI-NEXT: s_lshr_b32 s88, s15, 16 -; SI-NEXT: s_lshr_b32 s89, s14, 16 -; SI-NEXT: s_lshr_b32 s90, s13, 16 -; SI-NEXT: s_lshr_b32 s91, s12, 16 -; SI-NEXT: s_lshr_b32 s92, s11, 16 -; SI-NEXT: s_lshr_b32 s93, s10, 16 -; SI-NEXT: s_lshr_b32 s94, s8, 16 -; SI-NEXT: s_lshr_b32 s95, s7, 16 -; SI-NEXT: s_lshr_b32 vcc_lo, s6, 16 -; SI-NEXT: s_lshr_b32 vcc_hi, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s45 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v47, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s43 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v56, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s41 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v58, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v28, vcc_hi -; SI-NEXT: v_cvt_f32_f16_e32 v29, vcc_lo -; SI-NEXT: v_cvt_f32_f16_e32 v26, s95 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s94 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s93 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s92 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s91 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s90 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s89 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s88 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s79 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s78 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s77 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s76 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s75 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s74 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s73 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s72 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s63 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s62 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 +; SI-NEXT: s_add_i32 s5, s5, 3 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[18:19], 16 +; SI-NEXT: s_lshr_b32 s34, s5, 16 +; SI-NEXT: s_lshr_b32 s35, s7, 16 +; SI-NEXT: s_lshr_b32 s36, s9, 16 +; SI-NEXT: s_lshr_b32 s37, s11, 16 +; SI-NEXT: s_lshr_b32 s38, s13, 16 +; SI-NEXT: s_lshr_b32 s39, s15, 16 +; SI-NEXT: s_lshr_b32 s48, s17, 16 +; SI-NEXT: s_lshr_b32 s49, s19, 16 +; SI-NEXT: s_lshr_b32 s50, s21, 16 +; SI-NEXT: s_lshr_b32 s51, s23, 16 +; SI-NEXT: s_lshr_b32 s52, s25, 16 +; SI-NEXT: s_lshr_b32 s53, s41, 16 +; SI-NEXT: s_lshr_b32 s54, s43, 16 +; SI-NEXT: s_lshr_b32 s55, s45, 16 +; SI-NEXT: s_lshr_b32 s64, s47, 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[46:47], 16 ; SI-NEXT: .LBB17_3: ; %end -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 -; SI-NEXT: v_or_b32_e32 v2, v56, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v56 -; SI-NEXT: v_or_b32_e32 v5, v5, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v46 -; SI-NEXT: v_or_b32_e32 v7, v7, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v44 -; SI-NEXT: v_or_b32_e32 v9, v42, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: v_or_b32_e32 v11, v40, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v12 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v60, v1 -; SI-NEXT: v_or_b32_e32 v3, v58, v3 -; SI-NEXT: v_or_b32_e32 v4, v47, v4 -; SI-NEXT: v_or_b32_e32 v6, v45, v6 -; SI-NEXT: v_or_b32_e32 v8, v43, v8 -; SI-NEXT: v_or_b32_e32 v10, v41, v10 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v40 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v13, v54, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v54 -; SI-NEXT: v_or_b32_e32 v15, v52, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v52 -; SI-NEXT: v_or_b32_e32 v17, v50, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v50 -; SI-NEXT: v_or_b32_e32 v19, v48, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v48 -; SI-NEXT: v_or_b32_e32 v21, v38, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v38 -; SI-NEXT: v_or_b32_e32 v23, v36, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v36 -; SI-NEXT: v_or_b32_e32 v25, v34, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v34 -; SI-NEXT: v_or_b32_e32 v27, v32, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v32 -; SI-NEXT: v_or_b32_e32 v12, v55, v12 -; SI-NEXT: v_or_b32_e32 v14, v53, v14 -; SI-NEXT: v_or_b32_e32 v16, v51, v16 -; SI-NEXT: v_or_b32_e32 v18, v49, v18 -; SI-NEXT: v_or_b32_e32 v20, v39, v20 -; SI-NEXT: v_or_b32_e32 v22, v37, v22 -; SI-NEXT: v_or_b32_e32 v24, v35, v24 -; SI-NEXT: v_or_b32_e32 v26, v33, v26 -; SI-NEXT: v_or_b32_e32 v28, v31, v28 -; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: s_lshl_b32 s27, s30, 16 +; SI-NEXT: s_and_b32 s29, s46, 0xffff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s29, s47, 0xffff +; SI-NEXT: s_lshl_b32 s46, s64, 16 +; SI-NEXT: s_or_b32 s29, s29, s46 +; SI-NEXT: s_lshl_b32 s46, s94, 16 +; SI-NEXT: s_and_b32 s44, s44, 0xffff +; SI-NEXT: s_or_b32 s44, s44, s46 +; SI-NEXT: s_and_b32 s45, s45, 0xffff +; SI-NEXT: s_lshl_b32 s46, s55, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_lshl_b32 s46, s92, 16 +; SI-NEXT: s_and_b32 s42, s42, 0xffff +; SI-NEXT: s_or_b32 s42, s42, s46 +; SI-NEXT: s_and_b32 s43, s43, 0xffff +; SI-NEXT: s_lshl_b32 s46, s54, 16 +; SI-NEXT: s_or_b32 s43, s43, s46 +; SI-NEXT: s_lshl_b32 s46, s90, 16 +; SI-NEXT: s_and_b32 s40, s40, 0xffff +; SI-NEXT: s_or_b32 s40, s40, s46 +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: s_lshl_b32 s46, s53, 16 +; SI-NEXT: s_or_b32 s41, s41, s46 +; SI-NEXT: s_lshl_b32 s46, s88, 16 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_or_b32 s24, s24, s46 +; SI-NEXT: s_and_b32 s25, s25, 0xffff +; SI-NEXT: s_lshl_b32 s46, s52, 16 +; SI-NEXT: s_or_b32 s25, s25, s46 +; SI-NEXT: s_lshl_b32 s46, s78, 16 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_or_b32 s22, s22, s46 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s46, s51, 16 +; SI-NEXT: s_or_b32 s23, s23, s46 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_lshl_b32 s46, s76, 16 +; SI-NEXT: s_or_b32 s20, s20, s46 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s46, s50, 16 +; SI-NEXT: s_or_b32 s21, s21, s46 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s46, s74, 16 +; SI-NEXT: s_or_b32 s18, s18, s46 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s49, 16 +; SI-NEXT: s_or_b32 s19, s19, s46 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s46, s72, 16 +; SI-NEXT: s_or_b32 s16, s16, s46 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s46, s48, 16 +; SI-NEXT: s_or_b32 s17, s17, s46 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s46, s62, 16 +; SI-NEXT: s_or_b32 s14, s14, s46 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s46, s39, 16 +; SI-NEXT: s_or_b32 s15, s15, s46 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s46, s60, 16 +; SI-NEXT: s_or_b32 s12, s12, s46 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s46, s38, 16 +; SI-NEXT: s_or_b32 s13, s13, s46 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s46, s58, 16 +; SI-NEXT: s_or_b32 s10, s10, s46 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s46, s37, 16 +; SI-NEXT: s_or_b32 s11, s11, s46 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s46, s56, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s28, s28, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s46 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s46, s36, 16 +; SI-NEXT: s_or_b32 s6, s6, s28 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s28, s35, 16 +; SI-NEXT: s_or_b32 s4, s4, s26 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s26, s34, 16 +; SI-NEXT: s_or_b32 s9, s9, s46 +; SI-NEXT: s_or_b32 s7, s7, s28 +; SI-NEXT: s_or_b32 s5, s5, s26 +; SI-NEXT: v_mov_b32_e32 v0, s27 +; SI-NEXT: v_mov_b32_e32 v1, s29 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s42 +; SI-NEXT: v_mov_b32_e32 v5, s43 +; SI-NEXT: v_mov_b32_e32 v6, s40 +; SI-NEXT: v_mov_b32_e32 v7, s41 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s22 +; SI-NEXT: v_mov_b32_e32 v11, s23 +; SI-NEXT: v_mov_b32_e32 v12, s20 +; SI-NEXT: v_mov_b32_e32 v13, s21 +; SI-NEXT: v_mov_b32_e32 v14, s18 +; SI-NEXT: v_mov_b32_e32 v15, s19 +; SI-NEXT: v_mov_b32_e32 v16, s16 +; SI-NEXT: v_mov_b32_e32 v17, s17 +; SI-NEXT: v_mov_b32_e32 v18, s14 +; SI-NEXT: v_mov_b32_e32 v19, s15 +; SI-NEXT: v_mov_b32_e32 v20, s12 +; SI-NEXT: v_mov_b32_e32 v21, s13 +; SI-NEXT: v_mov_b32_e32 v22, s10 +; SI-NEXT: v_mov_b32_e32 v23, s11 +; SI-NEXT: v_mov_b32_e32 v24, s8 +; SI-NEXT: v_mov_b32_e32 v25, s9 +; SI-NEXT: v_mov_b32_e32 v26, s6 +; SI-NEXT: v_mov_b32_e32 v27, s7 +; SI-NEXT: v_mov_b32_e32 v28, s4 +; SI-NEXT: v_mov_b32_e32 v29, s5 +; SI-NEXT: v_readlane_b32 s64, v30, 16 +; SI-NEXT: v_readlane_b32 s55, v30, 15 +; SI-NEXT: v_readlane_b32 s54, v30, 14 +; SI-NEXT: v_readlane_b32 s53, v30, 13 +; SI-NEXT: v_readlane_b32 s52, v30, 12 +; SI-NEXT: v_readlane_b32 s51, v30, 11 +; SI-NEXT: v_readlane_b32 s50, v30, 10 +; SI-NEXT: v_readlane_b32 s49, v30, 9 +; SI-NEXT: v_readlane_b32 s48, v30, 8 +; SI-NEXT: v_readlane_b32 s39, v30, 7 +; SI-NEXT: v_readlane_b32 s38, v30, 6 +; SI-NEXT: v_readlane_b32 s37, v30, 5 +; SI-NEXT: v_readlane_b32 s36, v30, 4 +; SI-NEXT: v_readlane_b32 s35, v30, 3 +; SI-NEXT: v_readlane_b32 s34, v30, 2 +; SI-NEXT: v_readlane_b32 s31, v30, 1 +; SI-NEXT: v_readlane_b32 s30, v30, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v30i32_to_v60f16_scalar: @@ -9945,236 +9449,284 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v61, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_mov_b32_e32 v50, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_mov_b32_e32 v51, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_mov_b32_e32 v52, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_mov_b32_e32 v53, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; SI-NEXT: v_mov_b32_e32 v54, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_mov_b32_e32 v55, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; SI-NEXT: v_mov_b32_e32 v40, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; SI-NEXT: v_mov_b32_e32 v41, v12 +; SI-NEXT: v_mov_b32_e32 v42, v11 +; SI-NEXT: v_mov_b32_e32 v43, v10 +; SI-NEXT: v_mov_b32_e32 v44, v9 +; SI-NEXT: v_mov_b32_e32 v45, v8 +; SI-NEXT: v_mov_b32_e32 v46, v7 +; SI-NEXT: v_mov_b32_e32 v47, v6 +; SI-NEXT: v_mov_b32_e32 v56, v5 +; SI-NEXT: v_mov_b32_e32 v57, v4 +; SI-NEXT: v_mov_b32_e32 v58, v3 +; SI-NEXT: v_mov_b32_e32 v59, v2 +; SI-NEXT: v_mov_b32_e32 v60, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v60 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v61 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v62 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v37 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v32 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v35 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v41 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v53 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v51 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 @@ -10226,157 +9778,20 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v33 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 -; SI-NEXT: v_or_b32_e32 v2, v46, v2 -; SI-NEXT: v_or_b32_e32 v20, v42, v20 -; SI-NEXT: v_or_b32_e32 v21, v40, v21 -; SI-NEXT: v_or_b32_e32 v22, v54, v22 -; SI-NEXT: v_or_b32_e32 v23, v52, v23 -; SI-NEXT: v_or_b32_e32 v24, v50, v24 -; SI-NEXT: v_or_b32_e32 v25, v48, v25 -; SI-NEXT: v_or_b32_e32 v26, v38, v26 -; SI-NEXT: v_or_b32_e32 v27, v36, v27 -; SI-NEXT: v_or_b32_e32 v28, v34, v28 -; SI-NEXT: v_or_b32_e32 v29, v32, v29 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v44, v19 -; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: .LBB18_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v60 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -10389,207 +9804,182 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v57 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v40 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v53 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v50 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v33 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v32 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v56 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v47 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v62 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v45 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v44 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v33 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v42 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v41 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v55 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v54 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v53 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v32 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v52 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v51 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v63 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 @@ -10597,74 +9987,95 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v44 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v50 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v41 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v55 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v54 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v51 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v49 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v48 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v37 -; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v35 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v34 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 ; SI-NEXT: v_or_b32_e32 v29, v31, v29 @@ -11511,84 +10922,37 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-LABEL: bitcast_v60f16_to_v30i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_mov_b32_e32 v32, v15 +; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_mov_b32_e32 v34, v13 +; SI-NEXT: v_mov_b32_e32 v35, v12 +; SI-NEXT: v_mov_b32_e32 v36, v11 +; SI-NEXT: v_mov_b32_e32 v37, v10 +; SI-NEXT: v_mov_b32_e32 v38, v9 +; SI-NEXT: v_mov_b32_e32 v39, v8 +; SI-NEXT: v_mov_b32_e32 v48, v7 +; SI-NEXT: v_mov_b32_e32 v49, v6 +; SI-NEXT: v_mov_b32_e32 v50, v5 +; SI-NEXT: v_mov_b32_e32 v51, v4 +; SI-NEXT: v_mov_b32_e32 v52, v3 +; SI-NEXT: v_mov_b32_e32 v53, v2 +; SI-NEXT: v_mov_b32_e32 v54, v1 +; SI-NEXT: v_mov_b32_e32 v55, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 -; SI-NEXT: s_lshr_b32 s40, s17, 16 -; SI-NEXT: s_lshr_b32 s41, s16, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -11605,576 +10969,373 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v34 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: s_lshr_b32 s15, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v33 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v32, s15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s18 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s20 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s22 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v35 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v48 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v49 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v50 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v51 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v52 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v53 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v54 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v55 ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v45 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v59 -; SI-NEXT: v_or_b32_e32 v12, v62, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v63 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 -; SI-NEXT: v_or_b32_e32 v2, v57, v2 -; SI-NEXT: v_mov_b32_e32 v43, v42 -; SI-NEXT: v_or_b32_e32 v3, v42, v3 -; SI-NEXT: v_or_b32_e32 v4, v40, v4 -; SI-NEXT: v_or_b32_e32 v5, v53, v5 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_or_b32_e32 v6, v52, v6 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_or_b32_e32 v7, v49, v7 -; SI-NEXT: v_or_b32_e32 v8, v48, v8 -; SI-NEXT: v_mov_b32_e32 v48, v39 -; SI-NEXT: v_mov_b32_e32 v35, v38 -; SI-NEXT: v_or_b32_e32 v9, v38, v9 -; SI-NEXT: v_mov_b32_e32 v61, v36 -; SI-NEXT: v_or_b32_e32 v10, v36, v10 -; SI-NEXT: v_mov_b32_e32 v33, v63 -; SI-NEXT: v_mov_b32_e32 v60, v34 -; SI-NEXT: v_or_b32_e32 v11, v34, v11 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v30 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v61 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v60 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v58 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v56 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 -; SI-NEXT: v_or_b32_e32 v1, v41, v1 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_mov_b32_e32 v34, v45 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v58, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_or_b32_e32 v29, v62, v29 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62 +; SI-NEXT: v_or_b32_e32 v28, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_or_b32_e32 v29, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v61 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v58 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v55 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v55 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v54 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v53 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v60 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v52 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v51 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v57 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v49 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v48 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v46 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v38 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v37 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v43 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v63 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v35 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v34 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, v62 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v40 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v32 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v61 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v59 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v58 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v50 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v56 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v47 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v39 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v45 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v44 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v36 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v42 ; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v41 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v33 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 @@ -12200,27 +11361,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB19_4: -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v58, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v60, v34 -; SI-NEXT: v_mov_b32_e32 v30, v58 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_mov_b32_e32 v43, v42 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_mov_b32_e32 v48, v39 -; SI-NEXT: v_mov_b32_e32 v35, v38 -; SI-NEXT: v_mov_b32_e32 v61, v36 -; SI-NEXT: v_mov_b32_e32 v33, v63 ; SI-NEXT: s_branch .LBB19_2 ; ; VI-LABEL: bitcast_v60f16_to_v30i32_scalar: @@ -19324,617 +18465,265 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v22 +; SI-NEXT: v_alignbit_b32 v30, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v31, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v32, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v33, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v34, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v35, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v36, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v37, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v38, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v39, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v49, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v52, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v54, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v41, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v43, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v11 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v62, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v27 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v25 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v28 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v40, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v1 ; SI-NEXT: .LBB32_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v18 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 -; SI-NEXT: v_mov_b32_e32 v62, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 ; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 ; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 ; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 -; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 ; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_alignbit_b32 v30, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v31, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v32, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v33, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v34, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v35, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v36, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v37, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v38, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v39, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v49, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v52, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v54, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v41, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v43, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 -; SI-NEXT: v_mov_b32_e32 v59, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_mov_b32_e32 v34, v27 -; SI-NEXT: v_mov_b32_e32 v32, v29 -; SI-NEXT: v_mov_b32_e32 v30, v28 -; SI-NEXT: v_mov_b32_e32 v60, v25 -; SI-NEXT: v_mov_b32_e32 v57, v26 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v1 ; SI-NEXT: .LBB32_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; SI-NEXT: v_or_b32_e32 v0, v0, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v59 +; SI-NEXT: v_or_b32_e32 v2, v2, v41 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v58 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v43 +; SI-NEXT: v_or_b32_e32 v3, v3, v41 +; SI-NEXT: v_or_b32_e32 v4, v4, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v57 +; SI-NEXT: v_or_b32_e32 v6, v6, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v56 +; SI-NEXT: v_or_b32_e32 v8, v8, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v47 +; SI-NEXT: v_or_b32_e32 v10, v10, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v46 +; SI-NEXT: v_or_b32_e32 v12, v12, v38 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v45 +; SI-NEXT: v_or_b32_e32 v14, v14, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v44 +; SI-NEXT: v_or_b32_e32 v16, v16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v42 +; SI-NEXT: v_or_b32_e32 v18, v18, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v40 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v40 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v4, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v36 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v32 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v6, v50 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v37 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v10, v33 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v63 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v58 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v59 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v24, v38 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v57 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v49 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v53 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v55 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 -; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v20, v20, v34 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v55 +; SI-NEXT: v_or_b32_e32 v22, v22, v33 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v53 +; SI-NEXT: v_or_b32_e32 v24, v24, v32 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v51 +; SI-NEXT: v_or_b32_e32 v26, v26, v31 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v50 +; SI-NEXT: v_or_b32_e32 v28, v28, v30 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v48 +; SI-NEXT: v_or_b32_e32 v5, v5, v54 +; SI-NEXT: v_or_b32_e32 v7, v7, v52 +; SI-NEXT: v_or_b32_e32 v9, v9, v49 +; SI-NEXT: v_or_b32_e32 v11, v11, v39 +; SI-NEXT: v_or_b32_e32 v13, v13, v38 +; SI-NEXT: v_or_b32_e32 v15, v15, v37 +; SI-NEXT: v_or_b32_e32 v17, v17, v36 +; SI-NEXT: v_or_b32_e32 v19, v19, v35 +; SI-NEXT: v_or_b32_e32 v21, v21, v34 +; SI-NEXT: v_or_b32_e32 v23, v23, v33 +; SI-NEXT: v_or_b32_e32 v25, v25, v32 +; SI-NEXT: v_or_b32_e32 v27, v27, v31 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -20554,21 +19343,21 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: v_mov_b32_e32 v19, s16 -; SI-NEXT: v_mov_b32_e32 v32, s17 -; SI-NEXT: v_mov_b32_e32 v31, s18 -; SI-NEXT: v_mov_b32_e32 v48, s19 +; SI-NEXT: v_mov_b32_e32 v24, s16 +; SI-NEXT: v_mov_b32_e32 v25, s17 +; SI-NEXT: v_mov_b32_e32 v28, s18 +; SI-NEXT: v_mov_b32_e32 v29, s19 +; SI-NEXT: v_mov_b32_e32 v26, s20 +; SI-NEXT: v_mov_b32_e32 v27, s21 +; SI-NEXT: v_mov_b32_e32 v22, s22 +; SI-NEXT: v_mov_b32_e32 v23, s23 +; SI-NEXT: v_mov_b32_e32 v20, s24 +; SI-NEXT: v_mov_b32_e32 v21, s25 +; SI-NEXT: v_mov_b32_e32 v18, s26 +; SI-NEXT: v_mov_b32_e32 v19, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v49, s20 -; SI-NEXT: v_mov_b32_e32 v39, s21 -; SI-NEXT: v_mov_b32_e32 v38, s22 -; SI-NEXT: v_mov_b32_e32 v37, s23 -; SI-NEXT: v_mov_b32_e32 v36, s24 -; SI-NEXT: v_mov_b32_e32 v35, s25 -; SI-NEXT: v_mov_b32_e32 v17, s26 -; SI-NEXT: v_mov_b32_e32 v34, s27 ; SI-NEXT: v_mov_b32_e32 v16, s28 -; SI-NEXT: v_mov_b32_e32 v33, s29 +; SI-NEXT: v_mov_b32_e32 v17, s29 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -20587,475 +19376,206 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB33_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v57, v16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v34 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v36 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v39 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v31 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v32 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v14 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v13 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v12 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v11 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v10 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v8 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v7 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v6 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v5 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v4 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v1 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v0 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[40:41], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[41:42], v[10:11], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v11 +; SI-NEXT: v_lshr_b64 v[42:43], v[8:9], 16 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v33 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[31:32], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[43:44], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[44:45], v[4:5], 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[45:46], v[2:3], 16 +; SI-NEXT: v_mov_b32_e32 v46, v30 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshr_b64 v[30:31], v[0:1], 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v19 +; SI-NEXT: v_lshr_b64 v[31:32], v[16:17], 16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v25 +; SI-NEXT: v_lshr_b64 v[50:51], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[26:27], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[24:25], 16 ; SI-NEXT: s_cbranch_execnz .LBB33_3 ; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_lshr_b64 v[30:31], v[14:15], 16 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_lshr_b64 v[40:41], v[12:13], 16 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v33, 1.0, v33 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[41:42], v[10:11], 16 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 ; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v6 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[42:43], v[8:9], 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v5 +; SI-NEXT: v_lshr_b64 v[30:31], v[0:1], 16 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 ; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 +; SI-NEXT: v_lshr_b64 v[43:44], v[6:7], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v11 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v3 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v56 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v19 -; SI-NEXT: v_add_f32_e32 v21, 1.0, v32 -; SI-NEXT: v_add_f32_e32 v27, 1.0, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v48 -; SI-NEXT: v_add_f32_e32 v23, 1.0, v49 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v44 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v38 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v37 -; SI-NEXT: v_add_f32_e32 v28, 1.0, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 -; SI-NEXT: v_add_f32_e32 v30, 1.0, v35 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_add_f32_e32 v34, 1.0, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[44:45], v[4:5], 16 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[31:32], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[45:46], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[50:51], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[26:27], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[24:25], 16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v25 ; SI-NEXT: .LBB33_3: ; %end -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v32, v24, v32 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v54 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_or_b32_e32 v33, v24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v34 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v28 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v61 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v22 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v56 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v26 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v44 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v62 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v55 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v59 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v51 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v57 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v24, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v47 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v50 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v24, v53 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v41 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v34, v25, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v57 +; SI-NEXT: v_or_b32_e32 v35, v24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v36 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v36, v25, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v56 +; SI-NEXT: v_or_b32_e32 v37, v24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v38 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v38, v22, v24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v47 +; SI-NEXT: v_or_b32_e32 v39, v22, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v48 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v48, v20, v22 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v63 +; SI-NEXT: v_or_b32_e32 v49, v20, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v50 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v50, v18, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v52 +; SI-NEXT: v_or_b32_e32 v51, v18, v19 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 +; SI-NEXT: v_or_b32_e32 v52, v16, v18 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v53 +; SI-NEXT: v_or_b32_e32 v53, v16, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v0, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 +; SI-NEXT: v_or_b32_e32 v31, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v61 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v60 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v58 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_mov_b32_e32 v3, v35 +; SI-NEXT: v_mov_b32_e32 v4, v36 +; SI-NEXT: v_mov_b32_e32 v5, v37 +; SI-NEXT: v_mov_b32_e32 v6, v38 +; SI-NEXT: v_mov_b32_e32 v7, v39 +; SI-NEXT: v_mov_b32_e32 v8, v48 +; SI-NEXT: v_mov_b32_e32 v9, v49 +; SI-NEXT: v_mov_b32_e32 v10, v50 +; SI-NEXT: v_mov_b32_e32 v11, v51 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v45 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -21072,102 +19592,59 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 -; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: v_mov_b32_e32 v2, v34 +; SI-NEXT: v_mov_b32_e32 v12, v52 +; SI-NEXT: v_mov_b32_e32 v13, v53 +; SI-NEXT: v_mov_b32_e32 v14, v30 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v28, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: v_or_b32_e32 v29, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v32 +; SI-NEXT: v_mov_b32_e32 v1, v33 +; SI-NEXT: v_mov_b32_e32 v15, v31 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; kill: killed $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; kill: killed $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; kill: killed $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; kill: killed $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_branch .LBB33_2 ; ; VI-LABEL: bitcast_v30f32_to_v60f16_scalar: @@ -22087,316 +20564,156 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v61, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_mov_b32_e32 v50, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_mov_b32_e32 v51, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_mov_b32_e32 v52, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_mov_b32_e32 v53, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; SI-NEXT: v_mov_b32_e32 v54, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_mov_b32_e32 v55, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; SI-NEXT: v_mov_b32_e32 v40, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; SI-NEXT: v_mov_b32_e32 v41, v12 +; SI-NEXT: v_mov_b32_e32 v42, v11 +; SI-NEXT: v_mov_b32_e32 v43, v10 +; SI-NEXT: v_mov_b32_e32 v44, v9 +; SI-NEXT: v_mov_b32_e32 v45, v8 +; SI-NEXT: v_mov_b32_e32 v46, v7 +; SI-NEXT: v_mov_b32_e32 v47, v6 +; SI-NEXT: v_mov_b32_e32 v56, v5 +; SI-NEXT: v_mov_b32_e32 v57, v4 +; SI-NEXT: v_mov_b32_e32 v58, v3 +; SI-NEXT: v_mov_b32_e32 v59, v2 +; SI-NEXT: v_mov_b32_e32 v60, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v60 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v61 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v33 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 -; SI-NEXT: v_or_b32_e32 v2, v46, v2 -; SI-NEXT: v_or_b32_e32 v20, v42, v20 -; SI-NEXT: v_or_b32_e32 v21, v40, v21 -; SI-NEXT: v_or_b32_e32 v22, v54, v22 -; SI-NEXT: v_or_b32_e32 v23, v52, v23 -; SI-NEXT: v_or_b32_e32 v24, v50, v24 -; SI-NEXT: v_or_b32_e32 v25, v48, v25 -; SI-NEXT: v_or_b32_e32 v26, v38, v26 -; SI-NEXT: v_or_b32_e32 v27, v36, v27 -; SI-NEXT: v_or_b32_e32 v28, v34, v28 -; SI-NEXT: v_or_b32_e32 v29, v32, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v62 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v37 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v32 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v35 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v41 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 @@ -22404,121 +20721,192 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v53 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v51 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v44, v19 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: .LBB34_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v60 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -22531,207 +20919,182 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v57 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v40 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v53 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v50 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v33 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v32 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v56 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v47 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v62 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v45 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v44 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v33 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v42 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v41 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v55 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v54 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v53 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v32 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v52 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v51 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v63 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 @@ -22739,74 +21102,95 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v44 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v50 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v41 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v55 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v54 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v51 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v49 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v48 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v37 -; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v35 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v34 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 ; SI-NEXT: v_or_b32_e32 v29, v31, v29 @@ -23653,84 +22037,37 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-LABEL: bitcast_v60f16_to_v30f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_mov_b32_e32 v32, v15 +; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_mov_b32_e32 v34, v13 +; SI-NEXT: v_mov_b32_e32 v35, v12 +; SI-NEXT: v_mov_b32_e32 v36, v11 +; SI-NEXT: v_mov_b32_e32 v37, v10 +; SI-NEXT: v_mov_b32_e32 v38, v9 +; SI-NEXT: v_mov_b32_e32 v39, v8 +; SI-NEXT: v_mov_b32_e32 v48, v7 +; SI-NEXT: v_mov_b32_e32 v49, v6 +; SI-NEXT: v_mov_b32_e32 v50, v5 +; SI-NEXT: v_mov_b32_e32 v51, v4 +; SI-NEXT: v_mov_b32_e32 v52, v3 +; SI-NEXT: v_mov_b32_e32 v53, v2 +; SI-NEXT: v_mov_b32_e32 v54, v1 +; SI-NEXT: v_mov_b32_e32 v55, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 -; SI-NEXT: s_lshr_b32 s40, s17, 16 -; SI-NEXT: s_lshr_b32 s41, s16, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -23747,576 +22084,373 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v34 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: s_lshr_b32 s15, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v33 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v32, s15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s18 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s20 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s22 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v35 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v48 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v49 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v50 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v51 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v52 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v53 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v54 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v55 ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v45 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v59 -; SI-NEXT: v_or_b32_e32 v12, v62, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v63 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 -; SI-NEXT: v_or_b32_e32 v2, v57, v2 -; SI-NEXT: v_mov_b32_e32 v43, v42 -; SI-NEXT: v_or_b32_e32 v3, v42, v3 -; SI-NEXT: v_or_b32_e32 v4, v40, v4 -; SI-NEXT: v_or_b32_e32 v5, v53, v5 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_or_b32_e32 v6, v52, v6 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_or_b32_e32 v7, v49, v7 -; SI-NEXT: v_or_b32_e32 v8, v48, v8 -; SI-NEXT: v_mov_b32_e32 v48, v39 -; SI-NEXT: v_mov_b32_e32 v35, v38 -; SI-NEXT: v_or_b32_e32 v9, v38, v9 -; SI-NEXT: v_mov_b32_e32 v61, v36 -; SI-NEXT: v_or_b32_e32 v10, v36, v10 -; SI-NEXT: v_mov_b32_e32 v33, v63 -; SI-NEXT: v_mov_b32_e32 v60, v34 -; SI-NEXT: v_or_b32_e32 v11, v34, v11 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v30 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v61 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v60 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v58 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v56 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 -; SI-NEXT: v_or_b32_e32 v1, v41, v1 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_mov_b32_e32 v34, v45 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v58, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_or_b32_e32 v29, v62, v29 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62 +; SI-NEXT: v_or_b32_e32 v28, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_or_b32_e32 v29, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v61 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v58 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v55 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v54 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v53 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v60 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v52 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v51 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v57 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v49 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v48 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v46 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v38 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v37 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v43 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v63 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v35 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v34 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, v62 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v40 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v32 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v61 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v59 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v58 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v50 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v56 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v47 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v39 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v45 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v44 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v36 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v42 ; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v41 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v33 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 @@ -24342,27 +22476,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB35_4: -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v58, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v60, v34 -; SI-NEXT: v_mov_b32_e32 v30, v58 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_mov_b32_e32 v43, v42 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_mov_b32_e32 v48, v39 -; SI-NEXT: v_mov_b32_e32 v35, v38 -; SI-NEXT: v_mov_b32_e32 v61, v36 -; SI-NEXT: v_mov_b32_e32 v33, v63 ; SI-NEXT: s_branch .LBB35_2 ; ; VI-LABEL: bitcast_v60f16_to_v30f32_scalar: @@ -30528,265 +28642,89 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v22 +; SI-NEXT: v_alignbit_b32 v30, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v31, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v32, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v33, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v34, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v35, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v36, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v37, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v38, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v39, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v48, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v51, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v54, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v40, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v43, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v11 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v62, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v27 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v25 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v28 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v40, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v1 ; SI-NEXT: .LBB44_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_4 @@ -30795,7 +28733,6 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 @@ -30814,330 +28751,156 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v2 ; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; SI-NEXT: v_cvt_f32_f16_e32 v42, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 ; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 ; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc -; SI-NEXT: v_cvt_f32_f16_e32 v46, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 -; SI-NEXT: v_mov_b32_e32 v62, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 ; SI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v30, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v31, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v32, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v33, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v34, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v35, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v36, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v37, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v38, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v39, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v48, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v51, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v54, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v40, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v43, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 -; SI-NEXT: v_mov_b32_e32 v59, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_mov_b32_e32 v34, v27 -; SI-NEXT: v_mov_b32_e32 v32, v29 -; SI-NEXT: v_mov_b32_e32 v30, v28 -; SI-NEXT: v_mov_b32_e32 v60, v25 -; SI-NEXT: v_mov_b32_e32 v57, v26 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v1 ; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_or_b32_e32 v0, v0, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v59 +; SI-NEXT: v_or_b32_e32 v2, v2, v40 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v58 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v43 +; SI-NEXT: v_or_b32_e32 v3, v3, v40 +; SI-NEXT: v_or_b32_e32 v4, v4, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v57 +; SI-NEXT: v_or_b32_e32 v6, v6, v51 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v56 +; SI-NEXT: v_or_b32_e32 v8, v8, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v47 +; SI-NEXT: v_or_b32_e32 v10, v10, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v46 +; SI-NEXT: v_or_b32_e32 v12, v12, v38 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v45 +; SI-NEXT: v_or_b32_e32 v14, v14, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v44 +; SI-NEXT: v_or_b32_e32 v16, v16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v42 +; SI-NEXT: v_or_b32_e32 v18, v18, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v41 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v40 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v4, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v36 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v32 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v6, v50 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v37 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v10, v33 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v63 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v58 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v59 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v24, v38 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v57 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v49 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v53 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v55 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 -; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v20, v20, v34 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v55 +; SI-NEXT: v_or_b32_e32 v22, v22, v33 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v53 +; SI-NEXT: v_or_b32_e32 v24, v24, v32 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v52 +; SI-NEXT: v_or_b32_e32 v26, v26, v31 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v50 +; SI-NEXT: v_or_b32_e32 v28, v28, v30 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v49 +; SI-NEXT: v_or_b32_e32 v5, v5, v54 +; SI-NEXT: v_or_b32_e32 v7, v7, v51 +; SI-NEXT: v_or_b32_e32 v9, v9, v48 +; SI-NEXT: v_or_b32_e32 v11, v11, v39 +; SI-NEXT: v_or_b32_e32 v13, v13, v38 +; SI-NEXT: v_or_b32_e32 v15, v15, v37 +; SI-NEXT: v_or_b32_e32 v17, v17, v36 +; SI-NEXT: v_or_b32_e32 v19, v19, v35 +; SI-NEXT: v_or_b32_e32 v21, v21, v34 +; SI-NEXT: v_or_b32_e32 v23, v23, v33 +; SI-NEXT: v_or_b32_e32 v25, v25, v32 +; SI-NEXT: v_or_b32_e32 v27, v27, v31 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -31802,485 +29565,341 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; SI-LABEL: bitcast_v15i64_to_v60f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v30, s30, 0 +; SI-NEXT: v_writelane_b32 v30, s31, 1 +; SI-NEXT: v_writelane_b32 v30, s34, 2 +; SI-NEXT: v_writelane_b32 v30, s35, 3 +; SI-NEXT: v_writelane_b32 v30, s36, 4 +; SI-NEXT: v_writelane_b32 v30, s37, 5 +; SI-NEXT: v_writelane_b32 v30, s38, 6 +; SI-NEXT: v_writelane_b32 v30, s39, 7 +; SI-NEXT: v_writelane_b32 v30, s48, 8 +; SI-NEXT: v_writelane_b32 v30, s49, 9 +; SI-NEXT: v_writelane_b32 v30, s50, 10 ; SI-NEXT: v_mov_b32_e32 v17, s16 ; SI-NEXT: v_mov_b32_e32 v18, s17 +; SI-NEXT: v_writelane_b32 v30, s51, 11 ; SI-NEXT: v_mov_b32_e32 v19, s18 -; SI-NEXT: v_readfirstlane_b32 s40, v17 +; SI-NEXT: v_readfirstlane_b32 s46, v17 ; SI-NEXT: v_mov_b32_e32 v17, s19 -; SI-NEXT: v_readfirstlane_b32 s43, v18 +; SI-NEXT: v_readfirstlane_b32 s47, v18 ; SI-NEXT: v_mov_b32_e32 v18, s20 -; SI-NEXT: v_readfirstlane_b32 s41, v19 +; SI-NEXT: v_writelane_b32 v30, s52, 12 +; SI-NEXT: v_readfirstlane_b32 s44, v19 ; SI-NEXT: v_mov_b32_e32 v19, s21 -; SI-NEXT: v_readfirstlane_b32 s44, v17 +; SI-NEXT: v_readfirstlane_b32 s45, v17 ; SI-NEXT: v_mov_b32_e32 v17, s22 ; SI-NEXT: v_readfirstlane_b32 s42, v18 ; SI-NEXT: v_mov_b32_e32 v18, s23 -; SI-NEXT: v_readfirstlane_b32 s45, v19 +; SI-NEXT: v_writelane_b32 v30, s53, 13 +; SI-NEXT: v_readfirstlane_b32 s43, v19 ; SI-NEXT: v_mov_b32_e32 v19, s24 -; SI-NEXT: v_readfirstlane_b32 s24, v17 +; SI-NEXT: v_readfirstlane_b32 s40, v17 ; SI-NEXT: v_mov_b32_e32 v17, s25 -; SI-NEXT: v_readfirstlane_b32 s46, v18 +; SI-NEXT: v_readfirstlane_b32 s41, v18 ; SI-NEXT: v_mov_b32_e32 v18, s26 -; SI-NEXT: v_readfirstlane_b32 s25, v19 +; SI-NEXT: v_writelane_b32 v30, s54, 14 +; SI-NEXT: v_readfirstlane_b32 s24, v19 ; SI-NEXT: v_mov_b32_e32 v19, s27 -; SI-NEXT: v_readfirstlane_b32 s27, v17 +; SI-NEXT: v_readfirstlane_b32 s25, v17 ; SI-NEXT: v_mov_b32_e32 v17, s28 -; SI-NEXT: v_readfirstlane_b32 s26, v18 +; SI-NEXT: v_readfirstlane_b32 s22, v18 ; SI-NEXT: v_mov_b32_e32 v18, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: v_readfirstlane_b32 s28, v19 -; SI-NEXT: v_readfirstlane_b32 s22, v17 -; SI-NEXT: v_readfirstlane_b32 s23, v18 -; SI-NEXT: v_readfirstlane_b32 s20, v0 -; SI-NEXT: v_readfirstlane_b32 s21, v1 -; SI-NEXT: v_readfirstlane_b32 s18, v2 -; SI-NEXT: v_readfirstlane_b32 s19, v3 -; SI-NEXT: v_readfirstlane_b32 s16, v4 -; SI-NEXT: v_readfirstlane_b32 s17, v5 -; SI-NEXT: v_readfirstlane_b32 s14, v6 -; SI-NEXT: v_readfirstlane_b32 s15, v7 -; SI-NEXT: v_readfirstlane_b32 s12, v8 -; SI-NEXT: v_readfirstlane_b32 s13, v9 -; SI-NEXT: v_readfirstlane_b32 s10, v10 -; SI-NEXT: v_readfirstlane_b32 s11, v11 -; SI-NEXT: v_readfirstlane_b32 s7, v12 -; SI-NEXT: v_readfirstlane_b32 s8, v13 -; SI-NEXT: v_readfirstlane_b32 s6, v14 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v15 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_writelane_b32 v30, s55, 15 +; SI-NEXT: v_readfirstlane_b32 s23, v19 +; SI-NEXT: v_readfirstlane_b32 s20, v17 +; SI-NEXT: v_readfirstlane_b32 s21, v18 +; SI-NEXT: v_readfirstlane_b32 s18, v0 +; SI-NEXT: v_readfirstlane_b32 s19, v1 +; SI-NEXT: v_readfirstlane_b32 s16, v2 +; SI-NEXT: v_readfirstlane_b32 s17, v3 +; SI-NEXT: v_readfirstlane_b32 s14, v4 +; SI-NEXT: v_readfirstlane_b32 s15, v5 +; SI-NEXT: v_readfirstlane_b32 s12, v6 +; SI-NEXT: v_readfirstlane_b32 s13, v7 +; SI-NEXT: v_readfirstlane_b32 s10, v8 +; SI-NEXT: v_readfirstlane_b32 s11, v9 +; SI-NEXT: v_readfirstlane_b32 s8, v10 +; SI-NEXT: v_readfirstlane_b32 s9, v11 +; SI-NEXT: v_readfirstlane_b32 s6, v12 +; SI-NEXT: v_readfirstlane_b32 s7, v13 +; SI-NEXT: v_readfirstlane_b32 s4, v14 +; SI-NEXT: s_and_b64 s[26:27], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v15 +; SI-NEXT: v_writelane_b32 v30, s64, 16 ; SI-NEXT: s_cbranch_scc0 .LBB45_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s46, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 -; SI-NEXT: s_lshr_b32 s4, s45, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s42, 16 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 -; SI-NEXT: s_lshr_b32 s4, s44, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s41, 16 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v57, s4 -; SI-NEXT: s_lshr_b32 s4, s43, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s40, 16 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s40 +; SI-NEXT: s_lshr_b32 s34, s5, 16 +; SI-NEXT: s_lshr_b32 s35, s7, 16 +; SI-NEXT: s_lshr_b32 s36, s9, 16 +; SI-NEXT: s_lshr_b32 s37, s11, 16 +; SI-NEXT: s_lshr_b32 s38, s13, 16 +; SI-NEXT: s_lshr_b32 s39, s15, 16 +; SI-NEXT: s_lshr_b32 s48, s17, 16 +; SI-NEXT: s_lshr_b32 s49, s19, 16 +; SI-NEXT: s_lshr_b32 s50, s21, 16 +; SI-NEXT: s_lshr_b32 s51, s23, 16 +; SI-NEXT: s_lshr_b32 s52, s25, 16 +; SI-NEXT: s_lshr_b32 s53, s41, 16 +; SI-NEXT: s_lshr_b32 s54, s43, 16 +; SI-NEXT: s_lshr_b32 s55, s45, 16 +; SI-NEXT: s_lshr_b32 s64, s47, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[46:47], 16 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s40, 3 -; SI-NEXT: s_addc_u32 s5, s43, 0 -; SI-NEXT: s_lshr_b32 s29, s4, 16 -; SI-NEXT: s_lshr_b32 s40, s5, 16 -; SI-NEXT: s_add_u32 s41, s41, 3 -; SI-NEXT: s_addc_u32 s43, s44, 0 -; SI-NEXT: s_lshr_b32 s44, s41, 16 -; SI-NEXT: s_lshr_b32 s47, s43, 16 -; SI-NEXT: s_add_u32 s42, s42, 3 -; SI-NEXT: s_addc_u32 s45, s45, 0 -; SI-NEXT: s_lshr_b32 s56, s42, 16 -; SI-NEXT: s_lshr_b32 s57, s45, 16 -; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s46, s46, 0 -; SI-NEXT: s_lshr_b32 s58, s24, 16 -; SI-NEXT: s_lshr_b32 s59, s46, 16 -; SI-NEXT: s_add_u32 s25, s25, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 -; SI-NEXT: s_lshr_b32 s60, s25, 16 -; SI-NEXT: s_lshr_b32 s61, s27, 16 -; SI-NEXT: s_add_u32 s26, s26, 3 -; SI-NEXT: s_addc_u32 s28, s28, 0 -; SI-NEXT: s_lshr_b32 s62, s26, 16 -; SI-NEXT: s_lshr_b32 s63, s28, 16 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_lshr_b32 s72, s22, 16 -; SI-NEXT: s_lshr_b32 s73, s23, 16 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_lshr_b32 s74, s20, 16 -; SI-NEXT: s_lshr_b32 s75, s21, 16 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_lshr_b32 s76, s18, 16 -; SI-NEXT: s_lshr_b32 s77, s19, 16 -; SI-NEXT: s_add_u32 s16, s16, 3 -; SI-NEXT: s_addc_u32 s17, s17, 0 -; SI-NEXT: s_lshr_b32 s78, s16, 16 -; SI-NEXT: s_lshr_b32 s79, s17, 16 -; SI-NEXT: s_add_u32 s14, s14, 3 -; SI-NEXT: s_addc_u32 s15, s15, 0 -; SI-NEXT: s_lshr_b32 s88, s14, 16 -; SI-NEXT: s_lshr_b32 s89, s15, 16 -; SI-NEXT: s_add_u32 s12, s12, 3 -; SI-NEXT: s_addc_u32 s13, s13, 0 -; SI-NEXT: s_lshr_b32 s90, s12, 16 -; SI-NEXT: s_lshr_b32 s91, s13, 16 -; SI-NEXT: s_add_u32 s10, s10, 3 -; SI-NEXT: s_addc_u32 s11, s11, 0 -; SI-NEXT: s_lshr_b32 s92, s10, 16 -; SI-NEXT: s_lshr_b32 s93, s11, 16 -; SI-NEXT: s_add_u32 s7, s7, 3 -; SI-NEXT: s_addc_u32 s8, s8, 0 -; SI-NEXT: s_lshr_b32 s94, s7, 16 -; SI-NEXT: s_lshr_b32 s95, s8, 16 +; SI-NEXT: s_add_u32 s4, s4, 3 +; SI-NEXT: s_addc_u32 s5, s5, 0 ; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s7, s7, 0 +; SI-NEXT: s_add_u32 s8, s8, 3 ; SI-NEXT: s_addc_u32 s9, s9, 0 -; SI-NEXT: s_lshr_b32 vcc_lo, s6, 16 -; SI-NEXT: s_lshr_b32 vcc_hi, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s45 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v47, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s43 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v56, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v28, vcc_hi -; SI-NEXT: v_cvt_f32_f16_e32 v29, vcc_lo -; SI-NEXT: v_cvt_f32_f16_e32 v26, s95 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s94 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s93 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s92 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s91 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s90 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s89 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s88 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s79 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s78 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s77 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s76 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s75 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s74 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s73 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s72 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s63 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s62 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s40 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v59, s29 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s40, s40, 3 +; SI-NEXT: s_addc_u32 s41, s41, 0 +; SI-NEXT: s_add_u32 s42, s42, 3 +; SI-NEXT: s_addc_u32 s43, s43, 0 +; SI-NEXT: s_add_u32 s44, s44, 3 +; SI-NEXT: s_addc_u32 s45, s45, 0 +; SI-NEXT: s_add_u32 s46, s46, 3 +; SI-NEXT: s_addc_u32 s47, s47, 0 +; SI-NEXT: s_lshr_b32 s34, s5, 16 +; SI-NEXT: s_lshr_b32 s35, s7, 16 +; SI-NEXT: s_lshr_b32 s36, s9, 16 +; SI-NEXT: s_lshr_b32 s37, s11, 16 +; SI-NEXT: s_lshr_b32 s38, s13, 16 +; SI-NEXT: s_lshr_b32 s39, s15, 16 +; SI-NEXT: s_lshr_b32 s48, s17, 16 +; SI-NEXT: s_lshr_b32 s49, s19, 16 +; SI-NEXT: s_lshr_b32 s50, s21, 16 +; SI-NEXT: s_lshr_b32 s51, s23, 16 +; SI-NEXT: s_lshr_b32 s52, s25, 16 +; SI-NEXT: s_lshr_b32 s53, s41, 16 +; SI-NEXT: s_lshr_b32 s54, s43, 16 +; SI-NEXT: s_lshr_b32 s55, s45, 16 +; SI-NEXT: s_lshr_b32 s64, s47, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[46:47], 16 ; SI-NEXT: .LBB45_3: ; %end -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 -; SI-NEXT: v_or_b32_e32 v2, v56, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v56 -; SI-NEXT: v_or_b32_e32 v5, v5, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v46 -; SI-NEXT: v_or_b32_e32 v7, v7, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v44 -; SI-NEXT: v_or_b32_e32 v9, v42, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: v_or_b32_e32 v11, v40, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v12 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v60, v1 -; SI-NEXT: v_or_b32_e32 v3, v58, v3 -; SI-NEXT: v_or_b32_e32 v4, v47, v4 -; SI-NEXT: v_or_b32_e32 v6, v45, v6 -; SI-NEXT: v_or_b32_e32 v8, v43, v8 -; SI-NEXT: v_or_b32_e32 v10, v41, v10 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v40 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v13, v54, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v54 -; SI-NEXT: v_or_b32_e32 v15, v52, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v52 -; SI-NEXT: v_or_b32_e32 v17, v50, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v50 -; SI-NEXT: v_or_b32_e32 v19, v48, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v48 -; SI-NEXT: v_or_b32_e32 v21, v38, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v38 -; SI-NEXT: v_or_b32_e32 v23, v36, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v36 -; SI-NEXT: v_or_b32_e32 v25, v34, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v34 -; SI-NEXT: v_or_b32_e32 v27, v32, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v32 -; SI-NEXT: v_or_b32_e32 v12, v55, v12 -; SI-NEXT: v_or_b32_e32 v14, v53, v14 -; SI-NEXT: v_or_b32_e32 v16, v51, v16 -; SI-NEXT: v_or_b32_e32 v18, v49, v18 -; SI-NEXT: v_or_b32_e32 v20, v39, v20 -; SI-NEXT: v_or_b32_e32 v22, v37, v22 -; SI-NEXT: v_or_b32_e32 v24, v35, v24 -; SI-NEXT: v_or_b32_e32 v26, v33, v26 -; SI-NEXT: v_or_b32_e32 v28, v31, v28 -; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: s_lshl_b32 s27, s30, 16 +; SI-NEXT: s_and_b32 s29, s46, 0xffff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s29, s47, 0xffff +; SI-NEXT: s_lshl_b32 s46, s64, 16 +; SI-NEXT: s_or_b32 s29, s29, s46 +; SI-NEXT: s_lshl_b32 s46, s94, 16 +; SI-NEXT: s_and_b32 s44, s44, 0xffff +; SI-NEXT: s_or_b32 s44, s44, s46 +; SI-NEXT: s_and_b32 s45, s45, 0xffff +; SI-NEXT: s_lshl_b32 s46, s55, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_lshl_b32 s46, s92, 16 +; SI-NEXT: s_and_b32 s42, s42, 0xffff +; SI-NEXT: s_or_b32 s42, s42, s46 +; SI-NEXT: s_and_b32 s43, s43, 0xffff +; SI-NEXT: s_lshl_b32 s46, s54, 16 +; SI-NEXT: s_or_b32 s43, s43, s46 +; SI-NEXT: s_lshl_b32 s46, s90, 16 +; SI-NEXT: s_and_b32 s40, s40, 0xffff +; SI-NEXT: s_or_b32 s40, s40, s46 +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: s_lshl_b32 s46, s53, 16 +; SI-NEXT: s_or_b32 s41, s41, s46 +; SI-NEXT: s_lshl_b32 s46, s88, 16 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_or_b32 s24, s24, s46 +; SI-NEXT: s_and_b32 s25, s25, 0xffff +; SI-NEXT: s_lshl_b32 s46, s52, 16 +; SI-NEXT: s_or_b32 s25, s25, s46 +; SI-NEXT: s_lshl_b32 s46, s78, 16 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_or_b32 s22, s22, s46 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s46, s51, 16 +; SI-NEXT: s_or_b32 s23, s23, s46 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_lshl_b32 s46, s76, 16 +; SI-NEXT: s_or_b32 s20, s20, s46 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s46, s50, 16 +; SI-NEXT: s_or_b32 s21, s21, s46 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s46, s74, 16 +; SI-NEXT: s_or_b32 s18, s18, s46 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s49, 16 +; SI-NEXT: s_or_b32 s19, s19, s46 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s46, s72, 16 +; SI-NEXT: s_or_b32 s16, s16, s46 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s46, s48, 16 +; SI-NEXT: s_or_b32 s17, s17, s46 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s46, s62, 16 +; SI-NEXT: s_or_b32 s14, s14, s46 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s46, s39, 16 +; SI-NEXT: s_or_b32 s15, s15, s46 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s46, s60, 16 +; SI-NEXT: s_or_b32 s12, s12, s46 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s46, s38, 16 +; SI-NEXT: s_or_b32 s13, s13, s46 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s46, s58, 16 +; SI-NEXT: s_or_b32 s10, s10, s46 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s46, s37, 16 +; SI-NEXT: s_or_b32 s11, s11, s46 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s46, s56, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s28, s28, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s46 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s46, s36, 16 +; SI-NEXT: s_or_b32 s6, s6, s28 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s28, s35, 16 +; SI-NEXT: s_or_b32 s4, s4, s26 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s26, s34, 16 +; SI-NEXT: s_or_b32 s9, s9, s46 +; SI-NEXT: s_or_b32 s7, s7, s28 +; SI-NEXT: s_or_b32 s5, s5, s26 +; SI-NEXT: v_mov_b32_e32 v0, s27 +; SI-NEXT: v_mov_b32_e32 v1, s29 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s42 +; SI-NEXT: v_mov_b32_e32 v5, s43 +; SI-NEXT: v_mov_b32_e32 v6, s40 +; SI-NEXT: v_mov_b32_e32 v7, s41 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s22 +; SI-NEXT: v_mov_b32_e32 v11, s23 +; SI-NEXT: v_mov_b32_e32 v12, s20 +; SI-NEXT: v_mov_b32_e32 v13, s21 +; SI-NEXT: v_mov_b32_e32 v14, s18 +; SI-NEXT: v_mov_b32_e32 v15, s19 +; SI-NEXT: v_mov_b32_e32 v16, s16 +; SI-NEXT: v_mov_b32_e32 v17, s17 +; SI-NEXT: v_mov_b32_e32 v18, s14 +; SI-NEXT: v_mov_b32_e32 v19, s15 +; SI-NEXT: v_mov_b32_e32 v20, s12 +; SI-NEXT: v_mov_b32_e32 v21, s13 +; SI-NEXT: v_mov_b32_e32 v22, s10 +; SI-NEXT: v_mov_b32_e32 v23, s11 +; SI-NEXT: v_mov_b32_e32 v24, s8 +; SI-NEXT: v_mov_b32_e32 v25, s9 +; SI-NEXT: v_mov_b32_e32 v26, s6 +; SI-NEXT: v_mov_b32_e32 v27, s7 +; SI-NEXT: v_mov_b32_e32 v28, s4 +; SI-NEXT: v_mov_b32_e32 v29, s5 +; SI-NEXT: v_readlane_b32 s64, v30, 16 +; SI-NEXT: v_readlane_b32 s55, v30, 15 +; SI-NEXT: v_readlane_b32 s54, v30, 14 +; SI-NEXT: v_readlane_b32 s53, v30, 13 +; SI-NEXT: v_readlane_b32 s52, v30, 12 +; SI-NEXT: v_readlane_b32 s51, v30, 11 +; SI-NEXT: v_readlane_b32 s50, v30, 10 +; SI-NEXT: v_readlane_b32 s49, v30, 9 +; SI-NEXT: v_readlane_b32 s48, v30, 8 +; SI-NEXT: v_readlane_b32 s39, v30, 7 +; SI-NEXT: v_readlane_b32 s38, v30, 6 +; SI-NEXT: v_readlane_b32 s37, v30, 5 +; SI-NEXT: v_readlane_b32 s36, v30, 4 +; SI-NEXT: v_readlane_b32 s35, v30, 3 +; SI-NEXT: v_readlane_b32 s34, v30, 2 +; SI-NEXT: v_readlane_b32 s31, v30, 1 +; SI-NEXT: v_readlane_b32 s30, v30, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: s_branch .LBB45_2 ; ; VI-LABEL: bitcast_v15i64_to_v60f16_scalar: @@ -33114,236 +30733,284 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v61, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_mov_b32_e32 v50, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_mov_b32_e32 v51, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_mov_b32_e32 v52, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_mov_b32_e32 v53, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; SI-NEXT: v_mov_b32_e32 v54, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_mov_b32_e32 v55, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; SI-NEXT: v_mov_b32_e32 v40, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; SI-NEXT: v_mov_b32_e32 v41, v12 +; SI-NEXT: v_mov_b32_e32 v42, v11 +; SI-NEXT: v_mov_b32_e32 v43, v10 +; SI-NEXT: v_mov_b32_e32 v44, v9 +; SI-NEXT: v_mov_b32_e32 v45, v8 +; SI-NEXT: v_mov_b32_e32 v46, v7 +; SI-NEXT: v_mov_b32_e32 v47, v6 +; SI-NEXT: v_mov_b32_e32 v56, v5 +; SI-NEXT: v_mov_b32_e32 v57, v4 +; SI-NEXT: v_mov_b32_e32 v58, v3 +; SI-NEXT: v_mov_b32_e32 v59, v2 +; SI-NEXT: v_mov_b32_e32 v60, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v60 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v61 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v62 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v37 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v32 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v35 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v41 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v53 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v51 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 @@ -33395,157 +31062,20 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v33 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 -; SI-NEXT: v_or_b32_e32 v2, v46, v2 -; SI-NEXT: v_or_b32_e32 v20, v42, v20 -; SI-NEXT: v_or_b32_e32 v21, v40, v21 -; SI-NEXT: v_or_b32_e32 v22, v54, v22 -; SI-NEXT: v_or_b32_e32 v23, v52, v23 -; SI-NEXT: v_or_b32_e32 v24, v50, v24 -; SI-NEXT: v_or_b32_e32 v25, v48, v25 -; SI-NEXT: v_or_b32_e32 v26, v38, v26 -; SI-NEXT: v_or_b32_e32 v27, v36, v27 -; SI-NEXT: v_or_b32_e32 v28, v34, v28 -; SI-NEXT: v_or_b32_e32 v29, v32, v29 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v44, v19 -; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: .LBB46_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v60 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -33558,207 +31088,182 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v57 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v40 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v53 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v50 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v33 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v32 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v56 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v47 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v62 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v45 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v44 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v33 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v42 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v41 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v55 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v54 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v53 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v32 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v52 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v51 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v63 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 @@ -33766,74 +31271,95 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v44 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v50 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v41 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v55 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v54 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v51 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v49 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v48 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v37 -; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v35 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v34 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 ; SI-NEXT: v_or_b32_e32 v29, v31, v29 @@ -34680,84 +32206,37 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-LABEL: bitcast_v60f16_to_v15i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_mov_b32_e32 v32, v15 +; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_mov_b32_e32 v34, v13 +; SI-NEXT: v_mov_b32_e32 v35, v12 +; SI-NEXT: v_mov_b32_e32 v36, v11 +; SI-NEXT: v_mov_b32_e32 v37, v10 +; SI-NEXT: v_mov_b32_e32 v38, v9 +; SI-NEXT: v_mov_b32_e32 v39, v8 +; SI-NEXT: v_mov_b32_e32 v48, v7 +; SI-NEXT: v_mov_b32_e32 v49, v6 +; SI-NEXT: v_mov_b32_e32 v50, v5 +; SI-NEXT: v_mov_b32_e32 v51, v4 +; SI-NEXT: v_mov_b32_e32 v52, v3 +; SI-NEXT: v_mov_b32_e32 v53, v2 +; SI-NEXT: v_mov_b32_e32 v54, v1 +; SI-NEXT: v_mov_b32_e32 v55, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 -; SI-NEXT: s_lshr_b32 s40, s17, 16 -; SI-NEXT: s_lshr_b32 s41, s16, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -34774,576 +32253,373 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v34 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: s_lshr_b32 s15, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v33 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v32, s15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s18 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s20 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s22 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v35 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v48 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v49 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v50 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v51 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v52 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v53 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v54 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v55 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v45 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v59 -; SI-NEXT: v_or_b32_e32 v12, v62, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v63 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 -; SI-NEXT: v_or_b32_e32 v2, v57, v2 -; SI-NEXT: v_mov_b32_e32 v43, v42 -; SI-NEXT: v_or_b32_e32 v3, v42, v3 -; SI-NEXT: v_or_b32_e32 v4, v40, v4 -; SI-NEXT: v_or_b32_e32 v5, v53, v5 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_or_b32_e32 v6, v52, v6 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_or_b32_e32 v7, v49, v7 -; SI-NEXT: v_or_b32_e32 v8, v48, v8 -; SI-NEXT: v_mov_b32_e32 v48, v39 -; SI-NEXT: v_mov_b32_e32 v35, v38 -; SI-NEXT: v_or_b32_e32 v9, v38, v9 -; SI-NEXT: v_mov_b32_e32 v61, v36 -; SI-NEXT: v_or_b32_e32 v10, v36, v10 -; SI-NEXT: v_mov_b32_e32 v33, v63 -; SI-NEXT: v_mov_b32_e32 v60, v34 -; SI-NEXT: v_or_b32_e32 v11, v34, v11 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v30 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v61 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v60 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v58 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v56 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 -; SI-NEXT: v_or_b32_e32 v1, v41, v1 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_mov_b32_e32 v34, v45 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v58, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_or_b32_e32 v29, v62, v29 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62 +; SI-NEXT: v_or_b32_e32 v28, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_or_b32_e32 v29, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v61 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v58 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v55 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v54 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v53 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v60 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v52 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v51 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v57 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v49 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v48 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v46 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v38 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v37 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v43 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v63 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v35 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v34 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, v62 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v40 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v32 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v61 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v59 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v58 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v50 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v56 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v47 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v39 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v45 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v44 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v36 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v42 ; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v41 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v33 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 @@ -35369,27 +32645,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v58, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v60, v34 -; SI-NEXT: v_mov_b32_e32 v30, v58 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_mov_b32_e32 v43, v42 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_mov_b32_e32 v48, v39 -; SI-NEXT: v_mov_b32_e32 v35, v38 -; SI-NEXT: v_mov_b32_e32 v61, v36 -; SI-NEXT: v_mov_b32_e32 v33, v63 ; SI-NEXT: s_branch .LBB47_2 ; ; VI-LABEL: bitcast_v60f16_to_v15i64_scalar: @@ -40584,273 +37840,96 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v30, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v31, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v32, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v33, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v34, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v35, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v36, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v37, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v38, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v39, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v48, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v51, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v54, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v40, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v43, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v11 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v7 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v5 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v52 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v34, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v53 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v30, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v58 -; SI-NEXT: v_mov_b32_e32 v58, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v59 -; SI-NEXT: v_mov_b32_e32 v59, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v62 -; SI-NEXT: v_mov_b32_e32 v62, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v63 -; SI-NEXT: v_mov_b32_e32 v63, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v56 -; SI-NEXT: v_mov_b32_e32 v56, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v57 -; SI-NEXT: v_mov_b32_e32 v57, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v60 -; SI-NEXT: v_mov_b32_e32 v60, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v61 -; SI-NEXT: v_mov_b32_e32 v61, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v32 -; SI-NEXT: v_mov_b32_e32 v32, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v2 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v1 ; SI-NEXT: .LBB52_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v21 -; SI-NEXT: v_add_f64 v[30:31], v[0:1], 1.0 -; SI-NEXT: v_add_f64 v[35:36], v[2:3], 1.0 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v47 ; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 @@ -40858,314 +37937,154 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 ; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 ; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_alignbit_b32 v30, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v31, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v32, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v33, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v34, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v35, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v36, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v37, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v38, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v39, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v48, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v51, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v54, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v40, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v43, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v0 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v1 ; SI-NEXT: .LBB52_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v49 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v44 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_or_b32_e32 v0, v0, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v59 +; SI-NEXT: v_or_b32_e32 v2, v2, v40 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v58 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v43 +; SI-NEXT: v_or_b32_e32 v3, v3, v40 +; SI-NEXT: v_or_b32_e32 v4, v4, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v57 +; SI-NEXT: v_or_b32_e32 v6, v6, v51 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v56 +; SI-NEXT: v_or_b32_e32 v8, v8, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v47 +; SI-NEXT: v_or_b32_e32 v10, v10, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v46 +; SI-NEXT: v_or_b32_e32 v12, v12, v38 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v45 +; SI-NEXT: v_or_b32_e32 v14, v14, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v44 +; SI-NEXT: v_or_b32_e32 v16, v16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v42 +; SI-NEXT: v_or_b32_e32 v18, v18, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v41 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v36 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v43 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v42 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v40 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v48 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v63 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v55 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v10, v53 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v50 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v39 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v18, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v32 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v57 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v24, v60 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v58 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v62 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v20, v20, v34 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v55 +; SI-NEXT: v_or_b32_e32 v22, v22, v33 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v53 +; SI-NEXT: v_or_b32_e32 v24, v24, v32 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v52 +; SI-NEXT: v_or_b32_e32 v26, v26, v31 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v50 +; SI-NEXT: v_or_b32_e32 v28, v28, v30 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v49 +; SI-NEXT: v_or_b32_e32 v5, v5, v54 +; SI-NEXT: v_or_b32_e32 v7, v7, v51 +; SI-NEXT: v_or_b32_e32 v9, v9, v48 +; SI-NEXT: v_or_b32_e32 v11, v11, v39 +; SI-NEXT: v_or_b32_e32 v13, v13, v38 +; SI-NEXT: v_or_b32_e32 v15, v15, v37 +; SI-NEXT: v_or_b32_e32 v17, v17, v36 +; SI-NEXT: v_or_b32_e32 v19, v19, v35 +; SI-NEXT: v_or_b32_e32 v21, v21, v34 +; SI-NEXT: v_or_b32_e32 v23, v23, v33 +; SI-NEXT: v_or_b32_e32 v25, v25, v32 +; SI-NEXT: v_or_b32_e32 v27, v27, v31 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 -; SI-NEXT: v_or_b32_e32 v29, v31, v29 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v15f64_to_v60f16: @@ -41754,14 +38673,14 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: v_mov_b32_e32 v22, s16 -; SI-NEXT: v_mov_b32_e32 v23, s17 +; SI-NEXT: v_mov_b32_e32 v24, s16 +; SI-NEXT: v_mov_b32_e32 v25, s17 ; SI-NEXT: v_mov_b32_e32 v28, s18 ; SI-NEXT: v_mov_b32_e32 v29, s19 ; SI-NEXT: v_mov_b32_e32 v26, s20 ; SI-NEXT: v_mov_b32_e32 v27, s21 -; SI-NEXT: v_mov_b32_e32 v24, s22 -; SI-NEXT: v_mov_b32_e32 v25, s23 +; SI-NEXT: v_mov_b32_e32 v22, s22 +; SI-NEXT: v_mov_b32_e32 v23, s23 ; SI-NEXT: v_mov_b32_e32 v20, s24 ; SI-NEXT: v_mov_b32_e32 v21, s25 ; SI-NEXT: v_mov_b32_e32 v18, s26 @@ -41787,463 +38706,191 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v42, v12 +; SI-NEXT: v_lshr_b64 v[40:41], v[12:13], 16 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[41:42], v[10:11], 16 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v30 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v30 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v10 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v11 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v30 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v8 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v8 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v30 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v6 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v30 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v3 +; SI-NEXT: v_lshr_b64 v[42:43], v[8:9], 16 ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v31 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v0 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v17 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v17 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v18 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v20 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v30 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v24 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v13 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v11 +; SI-NEXT: v_lshr_b64 v[43:44], v[6:7], 16 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[44:45], v[4:5], 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v22 +; SI-NEXT: v_lshr_b64 v[30:31], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[45:46], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[31:32], v[18:19], 16 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v25 +; SI-NEXT: v_lshr_b64 v[54:55], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[46:47], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[26:27], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[24:25], 16 ; SI-NEXT: s_cbranch_execnz .LBB53_3 ; SI-NEXT: .LBB53_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 -; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_lshr_b64 v[40:41], v[12:13], 16 ; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v58 -; SI-NEXT: v_add_f64 v[39:40], v[22:23], 1.0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_lshr_b64 v[41:42], v[10:11], 16 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v56 -; SI-NEXT: v_mov_b32_e32 v56, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; SI-NEXT: v_lshr_b64 v[42:43], v[8:9], 16 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v11 +; SI-NEXT: v_lshr_b64 v[43:44], v[6:7], 16 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 ; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; SI-NEXT: v_add_f64 v[41:42], v[26:27], 1.0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v42 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[44:45], v[4:5], 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v22 -; SI-NEXT: v_mov_b32_e32 v31, v15 -; SI-NEXT: v_mov_b32_e32 v59, v13 -; SI-NEXT: v_mov_b32_e32 v58, v14 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[30:31], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[45:46], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[31:32], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[54:55], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[46:47], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[26:27], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[24:25], 16 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v25 ; SI-NEXT: .LBB53_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v44 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v49 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v42 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v6, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v60 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v10, v61 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v57 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v18, v62 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v56 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v35 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v24, v38 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v50 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v32, v24, v32 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v58 +; SI-NEXT: v_or_b32_e32 v33, v24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v34 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v34, v25, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v57 +; SI-NEXT: v_or_b32_e32 v35, v24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v36 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v36, v25, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v56 +; SI-NEXT: v_or_b32_e32 v37, v24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v38 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v38, v22, v24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v52 +; SI-NEXT: v_or_b32_e32 v39, v22, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v48 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v48, v20, v22 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v50 +; SI-NEXT: v_or_b32_e32 v49, v20, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v31 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v50, v18, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v51 +; SI-NEXT: v_or_b32_e32 v51, v18, v19 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 +; SI-NEXT: v_or_b32_e32 v52, v16, v18 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v53 +; SI-NEXT: v_or_b32_e32 v53, v16, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v46 +; SI-NEXT: v_or_b32_e32 v30, v0, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63 +; SI-NEXT: v_or_b32_e32 v31, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v61 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v60 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_mov_b32_e32 v2, v34 +; SI-NEXT: v_mov_b32_e32 v3, v35 +; SI-NEXT: v_mov_b32_e32 v4, v36 +; SI-NEXT: v_mov_b32_e32 v5, v37 +; SI-NEXT: v_mov_b32_e32 v6, v38 +; SI-NEXT: v_mov_b32_e32 v7, v39 +; SI-NEXT: v_mov_b32_e32 v8, v48 +; SI-NEXT: v_mov_b32_e32 v9, v49 +; SI-NEXT: v_mov_b32_e32 v10, v50 +; SI-NEXT: v_mov_b32_e32 v11, v51 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v58 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -42260,100 +38907,59 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v28, v54 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 -; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: v_mov_b32_e32 v12, v52 +; SI-NEXT: v_mov_b32_e32 v13, v53 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_or_b32_e32 v28, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v15 +; SI-NEXT: v_mov_b32_e32 v14, v30 +; SI-NEXT: v_mov_b32_e32 v15, v31 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v29, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v32 +; SI-NEXT: v_mov_b32_e32 v1, v33 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 ; SI-NEXT: s_branch .LBB53_2 ; ; VI-LABEL: bitcast_v15f64_to_v60f16_scalar: @@ -43243,316 +39849,156 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v61, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_mov_b32_e32 v50, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_mov_b32_e32 v51, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_mov_b32_e32 v52, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_mov_b32_e32 v53, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; SI-NEXT: v_mov_b32_e32 v54, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_mov_b32_e32 v55, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; SI-NEXT: v_mov_b32_e32 v40, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; SI-NEXT: v_mov_b32_e32 v41, v12 +; SI-NEXT: v_mov_b32_e32 v42, v11 +; SI-NEXT: v_mov_b32_e32 v43, v10 +; SI-NEXT: v_mov_b32_e32 v44, v9 +; SI-NEXT: v_mov_b32_e32 v45, v8 +; SI-NEXT: v_mov_b32_e32 v46, v7 +; SI-NEXT: v_mov_b32_e32 v47, v6 +; SI-NEXT: v_mov_b32_e32 v56, v5 +; SI-NEXT: v_mov_b32_e32 v57, v4 +; SI-NEXT: v_mov_b32_e32 v58, v3 +; SI-NEXT: v_mov_b32_e32 v59, v2 +; SI-NEXT: v_mov_b32_e32 v60, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v60 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v61 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v33 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 -; SI-NEXT: v_or_b32_e32 v2, v46, v2 -; SI-NEXT: v_or_b32_e32 v20, v42, v20 -; SI-NEXT: v_or_b32_e32 v21, v40, v21 -; SI-NEXT: v_or_b32_e32 v22, v54, v22 -; SI-NEXT: v_or_b32_e32 v23, v52, v23 -; SI-NEXT: v_or_b32_e32 v24, v50, v24 -; SI-NEXT: v_or_b32_e32 v25, v48, v25 -; SI-NEXT: v_or_b32_e32 v26, v38, v26 -; SI-NEXT: v_or_b32_e32 v27, v36, v27 -; SI-NEXT: v_or_b32_e32 v28, v34, v28 -; SI-NEXT: v_or_b32_e32 v29, v32, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v62 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v37 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v32 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v35 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v41 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 @@ -43560,121 +40006,192 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v53 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v51 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v44, v19 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: .LBB54_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v60 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -43687,207 +40204,182 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v57 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v40 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v53 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v50 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v33 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v32 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v56 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v47 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v62 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v45 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v44 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v33 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v42 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v41 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v55 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v54 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v53 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v32 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v52 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v51 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v63 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 @@ -43895,74 +40387,95 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v44 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v50 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v41 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v55 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v54 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v51 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v49 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v48 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v37 -; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v35 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v34 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 ; SI-NEXT: v_or_b32_e32 v29, v31, v29 @@ -44809,84 +41322,37 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-LABEL: bitcast_v60f16_to_v15f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_mov_b32_e32 v32, v15 +; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_mov_b32_e32 v34, v13 +; SI-NEXT: v_mov_b32_e32 v35, v12 +; SI-NEXT: v_mov_b32_e32 v36, v11 +; SI-NEXT: v_mov_b32_e32 v37, v10 +; SI-NEXT: v_mov_b32_e32 v38, v9 +; SI-NEXT: v_mov_b32_e32 v39, v8 +; SI-NEXT: v_mov_b32_e32 v48, v7 +; SI-NEXT: v_mov_b32_e32 v49, v6 +; SI-NEXT: v_mov_b32_e32 v50, v5 +; SI-NEXT: v_mov_b32_e32 v51, v4 +; SI-NEXT: v_mov_b32_e32 v52, v3 +; SI-NEXT: v_mov_b32_e32 v53, v2 +; SI-NEXT: v_mov_b32_e32 v54, v1 +; SI-NEXT: v_mov_b32_e32 v55, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 -; SI-NEXT: s_lshr_b32 s40, s17, 16 -; SI-NEXT: s_lshr_b32 s41, s16, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -44903,576 +41369,373 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v34 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: s_lshr_b32 s15, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v33 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v32, s15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s18 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s20 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s22 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v35 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v48 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v49 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v50 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v51 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v52 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v53 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v54 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v55 ; SI-NEXT: s_cbranch_scc0 .LBB55_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v45 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v59 -; SI-NEXT: v_or_b32_e32 v12, v62, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v63 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 -; SI-NEXT: v_or_b32_e32 v2, v57, v2 -; SI-NEXT: v_mov_b32_e32 v43, v42 -; SI-NEXT: v_or_b32_e32 v3, v42, v3 -; SI-NEXT: v_or_b32_e32 v4, v40, v4 -; SI-NEXT: v_or_b32_e32 v5, v53, v5 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_or_b32_e32 v6, v52, v6 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_or_b32_e32 v7, v49, v7 -; SI-NEXT: v_or_b32_e32 v8, v48, v8 -; SI-NEXT: v_mov_b32_e32 v48, v39 -; SI-NEXT: v_mov_b32_e32 v35, v38 -; SI-NEXT: v_or_b32_e32 v9, v38, v9 -; SI-NEXT: v_mov_b32_e32 v61, v36 -; SI-NEXT: v_or_b32_e32 v10, v36, v10 -; SI-NEXT: v_mov_b32_e32 v33, v63 -; SI-NEXT: v_mov_b32_e32 v60, v34 -; SI-NEXT: v_or_b32_e32 v11, v34, v11 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v30 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v61 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v60 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v58 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v56 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 -; SI-NEXT: v_or_b32_e32 v1, v41, v1 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_mov_b32_e32 v34, v45 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v58, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_or_b32_e32 v29, v62, v29 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62 +; SI-NEXT: v_or_b32_e32 v28, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_or_b32_e32 v29, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB55_3 ; SI-NEXT: .LBB55_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v61 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v58 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v55 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v54 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v53 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v60 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v52 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v51 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v57 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v49 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v48 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v46 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v38 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v37 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v43 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v63 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v35 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v34 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, v62 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v40 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v32 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v61 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v59 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v58 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v50 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v56 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v47 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v39 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v45 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v44 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v36 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v42 ; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v41 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v33 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 @@ -45498,27 +41761,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB55_4: -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v58, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v60, v34 -; SI-NEXT: v_mov_b32_e32 v30, v58 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_mov_b32_e32 v43, v42 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_mov_b32_e32 v48, v39 -; SI-NEXT: v_mov_b32_e32 v35, v38 -; SI-NEXT: v_mov_b32_e32 v61, v36 -; SI-NEXT: v_mov_b32_e32 v33, v63 ; SI-NEXT: s_branch .LBB55_2 ; ; VI-LABEL: bitcast_v60f16_to_v15f64_scalar: @@ -46254,6 +42497,7 @@ define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v60i16_to_v60f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v24 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -46270,834 +42514,805 @@ define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v1 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; kill: killed $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB56_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v43 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v56 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v57 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v60 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v52 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v51 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v50 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v3 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v62 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v43 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v40 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v7 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v53 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v9 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v59 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v11 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v58 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v13 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v57 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v15 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v56 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v47 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v45 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v54 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_2 +; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v43 +; SI-NEXT: v_or_b32_e32 v45, v1, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v45, v31, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 +; SI-NEXT: v_or_b32_e32 v63, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v63, v3, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v0, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v61, v1, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v61, v5, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v0, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v59, v1, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v59, v7, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v0, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v32 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v33 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v0, v0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v0, v0, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v54 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v25 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: .LBB56_2: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB56_4 -; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 -; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v56 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v57 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v56 -; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v58 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v59 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_or_b32_e32 v57, v1, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v1, v57, v53, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v57 -; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v60 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v56, v1, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v1, v56, v32, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v61 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v46, v1, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v1, v46, v33, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 -; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v62 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v44, v1, v9 +; SI-NEXT: v_alignbit_b32 v1, v44, v34, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v63 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v42, v1, v47 +; SI-NEXT: v_alignbit_b32 v1, v42, v35, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v41, v1, v49 +; SI-NEXT: v_alignbit_b32 v1, v41, v36, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v55, v1, v30 +; SI-NEXT: v_alignbit_b32 v1, v55, v60, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_or_b32_e32 v9, v1, v58 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_alignbit_b32 v1, v9, v37, 16 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_or_b32_e32 v1, v1, v50 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v1, v62, 16 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 -; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_or_b32_e32 v48, v1, v51 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_alignbit_b32 v1, v48, v38, 16 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 -; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_or_b32_e32 v1, v1, v52 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v1, v39, 16 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v9 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: .LBB56_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v28, v39, v28 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v26, v38, v26 +; SI-NEXT: v_add_i32_e32 v39, vcc, 0x30000, v28 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v26, v51, v26 +; SI-NEXT: v_or_b32_e32 v24, v62, v24 +; SI-NEXT: v_or_b32_e32 v22, v37, v22 +; SI-NEXT: v_add_i32_e32 v48, vcc, s6, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v22, v58, v22 +; SI-NEXT: v_or_b32_e32 v20, v60, v20 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v44 -; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v54, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v21 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v20, v30, v20 +; SI-NEXT: v_or_b32_e32 v18, v36, v18 +; SI-NEXT: v_add_i32_e32 v55, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v18, v49, v18 +; SI-NEXT: v_or_b32_e32 v16, v35, v16 +; SI-NEXT: v_add_i32_e32 v41, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v16, v47, v16 +; SI-NEXT: v_or_b32_e32 v14, v34, v14 +; SI-NEXT: v_add_i32_e32 v42, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v33, v12 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v32, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v31, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v50, v24 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_or_b32_e32 v28, v52, v28 +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v44, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v46, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_add_i32_e32 v56, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_add_i32_e32 v57, vcc, s6, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_add_i32_e32 v59, vcc, s6, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v61, vcc, s6, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v61 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v63, vcc, s6, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v63 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v45, vcc, s6, v0 +; SI-NEXT: v_alignbit_b32 v0, v45, v2, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v63, v4, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v61, v6, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 -; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v31 -; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 -; SI-NEXT: v_add_i32_e32 v51, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v59, v8, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v57, v10, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v56, v12, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 -; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v40 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v46, v14, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v44, v16, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v42, v18, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v43 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v41, v20, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v55, v22, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v54, v25, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v24, v26, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v42 -; SI-NEXT: v_add_i32_e32 v41, vcc, 3, v41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 -; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v48, v29, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v28, v39, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v59 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: .LBB56_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v47 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v53 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v45 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v63 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v59 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v56 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v46 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v45 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -47114,14 +43329,39 @@ define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 -; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v60i16_to_v60f16: @@ -47730,656 +43970,716 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-LABEL: bitcast_v60i16_to_v60f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_writelane_b32 v30, s30, 0 +; SI-NEXT: v_writelane_b32 v30, s31, 1 +; SI-NEXT: v_writelane_b32 v30, s34, 2 +; SI-NEXT: v_writelane_b32 v30, s35, 3 +; SI-NEXT: v_writelane_b32 v30, s36, 4 +; SI-NEXT: v_writelane_b32 v30, s37, 5 +; SI-NEXT: v_writelane_b32 v30, s38, 6 +; SI-NEXT: v_writelane_b32 v30, s39, 7 +; SI-NEXT: v_writelane_b32 v30, s48, 8 +; SI-NEXT: v_writelane_b32 v30, s49, 9 +; SI-NEXT: v_writelane_b32 v30, s50, 10 +; SI-NEXT: v_writelane_b32 v30, s51, 11 +; SI-NEXT: v_writelane_b32 v30, s52, 12 +; SI-NEXT: v_writelane_b32 v30, s53, 13 +; SI-NEXT: v_writelane_b32 v30, s54, 14 +; SI-NEXT: v_writelane_b32 v30, s55, 15 +; SI-NEXT: v_writelane_b32 v30, s64, 16 +; SI-NEXT: v_writelane_b32 v30, s65, 17 +; SI-NEXT: v_writelane_b32 v30, s66, 18 +; SI-NEXT: v_writelane_b32 v30, s67, 19 +; SI-NEXT: v_writelane_b32 v30, s68, 20 +; SI-NEXT: v_writelane_b32 v30, s69, 21 +; SI-NEXT: v_readfirstlane_b32 s4, v5 +; SI-NEXT: ; implicit-def: $vgpr31 : SGPR spill to VGPR lane +; SI-NEXT: v_writelane_b32 v30, s70, 22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v31, s4, 0 +; SI-NEXT: v_readfirstlane_b32 s4, v4 +; SI-NEXT: v_writelane_b32 v30, s71, 23 +; SI-NEXT: v_writelane_b32 v31, s4, 1 +; SI-NEXT: v_readfirstlane_b32 s4, v3 +; SI-NEXT: v_writelane_b32 v30, s80, 24 +; SI-NEXT: v_writelane_b32 v31, s4, 2 +; SI-NEXT: v_writelane_b32 v30, s81, 25 +; SI-NEXT: v_writelane_b32 v31, s29, 3 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_writelane_b32 v30, s82, 26 +; SI-NEXT: v_writelane_b32 v31, s4, 4 +; SI-NEXT: v_writelane_b32 v30, s83, 27 +; SI-NEXT: v_writelane_b32 v31, s27, 5 +; SI-NEXT: v_writelane_b32 v30, s84, 28 +; SI-NEXT: v_writelane_b32 v31, s25, 6 +; SI-NEXT: v_writelane_b32 v30, s85, 29 +; SI-NEXT: v_writelane_b32 v31, s23, 7 +; SI-NEXT: v_writelane_b32 v30, s86, 30 +; SI-NEXT: v_writelane_b32 v31, s21, 8 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_writelane_b32 v30, s87, 31 +; SI-NEXT: v_writelane_b32 v31, s4, 9 +; SI-NEXT: v_writelane_b32 v30, s96, 32 +; SI-NEXT: v_writelane_b32 v31, s16, 10 +; SI-NEXT: v_writelane_b32 v30, s97, 33 +; SI-NEXT: s_mov_b32 s59, s20 +; SI-NEXT: v_writelane_b32 v31, s18, 11 +; SI-NEXT: v_writelane_b32 v30, s98, 34 +; SI-NEXT: s_mov_b32 s98, s22 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; SI-NEXT: v_readfirstlane_b32 s78, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; SI-NEXT: v_readfirstlane_b32 s85, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v11 +; SI-NEXT: v_readfirstlane_b32 s96, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; SI-NEXT: v_readfirstlane_b32 s77, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v9 +; SI-NEXT: v_readfirstlane_b32 s39, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; SI-NEXT: v_readfirstlane_b32 s97, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; SI-NEXT: v_readfirstlane_b32 s51, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; SI-NEXT: v_readfirstlane_b32 s87, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_readfirstlane_b32 s75, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_readfirstlane_b32 s49, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: s_lshr_b32 s80, s29, 16 +; SI-NEXT: s_lshr_b32 s69, s27, 16 +; SI-NEXT: s_lshr_b32 s95, s26, 16 +; SI-NEXT: s_lshr_b32 s68, s25, 16 +; SI-NEXT: s_lshr_b32 s38, s24, 16 +; SI-NEXT: s_lshr_b32 s67, s23, 16 +; SI-NEXT: s_lshr_b32 s36, s22, 16 +; SI-NEXT: s_lshr_b32 s66, s21, 16 +; SI-NEXT: s_lshr_b32 s65, s19, 16 +; SI-NEXT: s_lshr_b32 s84, s18, 16 +; SI-NEXT: s_lshr_b32 s64, s17, 16 +; SI-NEXT: s_lshr_b32 s94, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: s_lshr_b32 s43, s29, 16 -; SI-NEXT: s_lshr_b32 s42, s28, 16 -; SI-NEXT: s_lshr_b32 s41, s27, 16 -; SI-NEXT: s_lshr_b32 s40, s26, 16 -; SI-NEXT: s_lshr_b32 s15, s25, 16 -; SI-NEXT: s_lshr_b32 s14, s24, 16 -; SI-NEXT: s_lshr_b32 s13, s23, 16 -; SI-NEXT: s_lshr_b32 s12, s22, 16 -; SI-NEXT: s_lshr_b32 s11, s21, 16 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s16, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 +; SI-NEXT: v_writelane_b32 v31, s59, 12 +; SI-NEXT: v_writelane_b32 v30, s99, 35 +; SI-NEXT: v_readfirstlane_b32 s29, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 +; SI-NEXT: v_readfirstlane_b32 s37, v1 +; SI-NEXT: v_readfirstlane_b32 s55, v0 +; SI-NEXT: v_readfirstlane_b32 s86, v17 +; SI-NEXT: v_readfirstlane_b32 s58, v18 +; SI-NEXT: v_readfirstlane_b32 s30, v19 +; SI-NEXT: v_readfirstlane_b32 s35, v15 +; SI-NEXT: v_readfirstlane_b32 s83, v14 +; SI-NEXT: v_readfirstlane_b32 s31, v13 +; SI-NEXT: v_readfirstlane_b32 s82, v12 +; SI-NEXT: v_readfirstlane_b32 s91, v11 +; SI-NEXT: v_readfirstlane_b32 s81, v10 +; SI-NEXT: v_readfirstlane_b32 s79, v9 +; SI-NEXT: v_readfirstlane_b32 s99, v8 +; SI-NEXT: v_readfirstlane_b32 s23, v7 +; SI-NEXT: v_readfirstlane_b32 s71, v6 +; SI-NEXT: v_readfirstlane_b32 s89, v5 +; SI-NEXT: v_readfirstlane_b32 s90, v4 +; SI-NEXT: v_readfirstlane_b32 s21, v3 +; SI-NEXT: v_writelane_b32 v31, s98, 13 +; SI-NEXT: v_writelane_b32 v31, s58, 14 ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v16, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s26 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s6 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, s22 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s17 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s28 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s7 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, s23 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s18 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s29 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s8 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, s14 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s19 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s9 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, s40 -; SI-NEXT: v_mov_b32_e32 v34, v20 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v20 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s10 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v1 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s11 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, s43 -; SI-NEXT: v_mov_b32_e32 v35, v21 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v21 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v36, v22 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v3 -; SI-NEXT: v_mov_b32_e32 v37, v23 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 -; SI-NEXT: v_mov_b32_e32 v38, v24 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v5 -; SI-NEXT: v_mov_b32_e32 v39, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v6 -; SI-NEXT: v_mov_b32_e32 v48, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v7 -; SI-NEXT: v_mov_b32_e32 v49, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v8 -; SI-NEXT: v_mov_b32_e32 v50, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v9 -; SI-NEXT: v_mov_b32_e32 v51, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v10 -; SI-NEXT: v_mov_b32_e32 v52, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v11 -; SI-NEXT: v_mov_b32_e32 v53, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v12 -; SI-NEXT: v_mov_b32_e32 v33, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v13 -; SI-NEXT: v_mov_b32_e32 v19, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v14 -; SI-NEXT: v_mov_b32_e32 v21, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v15 -; SI-NEXT: v_mov_b32_e32 v32, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s64, 16 +; SI-NEXT: s_or_b32 s47, s5, s7 +; SI-NEXT: s_and_b32 s5, s19, 0xffff +; SI-NEXT: s_lshl_b32 s7, s65, 16 +; SI-NEXT: s_or_b32 s45, s5, s7 +; SI-NEXT: v_readlane_b32 s5, v31, 8 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s7, s66, 16 +; SI-NEXT: s_or_b32 s43, s5, s7 +; SI-NEXT: v_readlane_b32 s5, v31, 7 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s7, s67, 16 +; SI-NEXT: s_or_b32 s41, s5, s7 +; SI-NEXT: v_readlane_b32 s5, v31, 6 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s7, s68, 16 +; SI-NEXT: s_or_b32 s15, s5, s7 +; SI-NEXT: v_readlane_b32 s5, v31, 5 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s7, s69, 16 +; SI-NEXT: s_or_b32 s13, s5, s7 +; SI-NEXT: v_readlane_b32 s5, v31, 3 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s7, s80, 16 +; SI-NEXT: s_or_b32 s11, s5, s7 +; SI-NEXT: s_and_b32 s5, s37, 0xffff +; SI-NEXT: s_lshl_b32 s7, s90, 16 +; SI-NEXT: s_or_b32 s9, s5, s7 +; SI-NEXT: v_readlane_b32 s5, v31, 2 +; SI-NEXT: v_readlane_b32 s4, v31, 9 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s7, s71, 16 +; SI-NEXT: s_lshl_b32 s42, s4, 16 +; SI-NEXT: v_readlane_b32 s4, v31, 4 +; SI-NEXT: s_or_b32 s7, s5, s7 +; SI-NEXT: v_readlane_b32 s5, v31, 0 +; SI-NEXT: v_writelane_b32 v31, s21, 21 +; SI-NEXT: v_writelane_b32 v31, s77, 22 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s56, s99, 16 +; SI-NEXT: v_writelane_b32 v31, s90, 23 +; SI-NEXT: s_or_b32 s5, s5, s56 +; SI-NEXT: s_and_b32 s56, s75, 0xffff +; SI-NEXT: s_lshl_b32 s57, s81, 16 +; SI-NEXT: v_writelane_b32 v31, s89, 24 +; SI-NEXT: s_or_b32 vcc_hi, s56, s57 +; SI-NEXT: s_and_b32 s56, s51, 0xffff +; SI-NEXT: s_lshl_b32 s57, s82, 16 +; SI-NEXT: v_writelane_b32 v31, s99, 25 +; SI-NEXT: v_writelane_b32 v31, s97, 26 +; SI-NEXT: s_mov_b32 s97, s49 +; SI-NEXT: s_or_b32 s49, s56, s57 +; SI-NEXT: s_and_b32 s56, s39, 0xffff +; SI-NEXT: s_lshl_b32 s57, s83, 16 +; SI-NEXT: v_writelane_b32 v31, s51, 27 +; SI-NEXT: s_or_b32 s51, s56, s57 +; SI-NEXT: s_and_b32 s56, s96, 0xffff +; SI-NEXT: s_lshl_b32 s57, s30, 16 +; SI-NEXT: s_or_b32 s53, s56, s57 +; SI-NEXT: s_and_b32 s56, s78, 0xffff +; SI-NEXT: s_lshl_b32 s57, s86, 16 +; SI-NEXT: s_lshl_b32 s46, s94, 16 +; SI-NEXT: s_mov_b32 s27, s67 +; SI-NEXT: s_mov_b32 s67, s66 +; SI-NEXT: s_mov_b32 s66, s65 +; SI-NEXT: s_mov_b32 s65, s64 +; SI-NEXT: s_mov_b32 s64, s55 +; SI-NEXT: s_or_b32 s55, s56, s57 +; SI-NEXT: s_and_b32 s56, s16, 0xffff +; SI-NEXT: s_lshl_b32 s44, s84, 16 +; SI-NEXT: s_mov_b32 s70, s69 +; SI-NEXT: s_mov_b32 s74, s68 +; SI-NEXT: s_or_b32 s60, s56, s46 +; SI-NEXT: s_lshr_b64 s[68:69], s[46:47], 16 +; SI-NEXT: s_and_b32 s46, s18, 0xffff +; SI-NEXT: s_lshl_b32 s8, s21, 16 +; SI-NEXT: s_or_b32 s72, s46, s44 +; SI-NEXT: s_lshr_b64 s[20:21], s[44:45], 16 +; SI-NEXT: s_and_b32 s44, s59, 0xffff +; SI-NEXT: s_lshl_b32 s40, s36, 16 +; SI-NEXT: s_lshl_b32 s10, s4, 16 +; SI-NEXT: s_lshl_b32 s4, s23, 16 +; SI-NEXT: s_mov_b32 s90, s23 +; SI-NEXT: s_or_b32 s62, s44, s42 +; SI-NEXT: s_lshr_b64 s[22:23], s[42:43], 16 +; SI-NEXT: s_and_b32 s42, s98, 0xffff +; SI-NEXT: s_lshl_b32 s14, s38, 16 +; SI-NEXT: s_lshl_b32 s52, s35, 16 +; SI-NEXT: s_mov_b32 s25, s17 +; SI-NEXT: s_mov_b32 s17, s35 +; SI-NEXT: s_lshl_b32 s54, s58, 16 +; SI-NEXT: s_or_b32 s58, s42, s40 +; SI-NEXT: s_lshr_b64 s[34:35], s[40:41], 16 +; SI-NEXT: s_and_b32 s40, s24, 0xffff +; SI-NEXT: s_lshl_b32 s12, s95, 16 +; SI-NEXT: s_or_b32 s56, s40, s14 +; SI-NEXT: s_lshr_b64 s[76:77], s[14:15], 16 +; SI-NEXT: s_and_b32 s14, s26, 0xffff +; SI-NEXT: s_lshl_b32 s6, s89, 16 +; SI-NEXT: s_or_b32 s46, s14, s12 +; SI-NEXT: s_lshr_b64 s[88:89], s[12:13], 16 +; SI-NEXT: s_and_b32 s12, s28, 0xffff +; SI-NEXT: s_or_b32 s44, s12, s10 +; SI-NEXT: s_lshr_b64 s[92:93], s[10:11], 16 +; SI-NEXT: s_and_b32 s10, s64, 0xffff +; SI-NEXT: s_or_b32 s42, s10, s8 +; SI-NEXT: s_lshr_b64 s[98:99], s[8:9], 16 +; SI-NEXT: s_and_b32 s8, s29, 0xffff +; SI-NEXT: s_mov_b32 s59, s41 +; SI-NEXT: s_or_b32 s40, s8, s6 +; SI-NEXT: s_mov_b32 s41, s7 +; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], 16 +; SI-NEXT: v_writelane_b32 v31, s6, 15 +; SI-NEXT: v_writelane_b32 v31, s7, 16 +; SI-NEXT: v_readlane_b32 s6, v31, 1 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_mov_b32 s57, s15 +; SI-NEXT: s_or_b32 s14, s6, s4 +; SI-NEXT: s_mov_b32 s15, s5 +; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 +; SI-NEXT: s_lshl_b32 vcc_lo, s79, 16 +; SI-NEXT: v_writelane_b32 v31, s4, 17 +; SI-NEXT: v_writelane_b32 v31, s5, 18 +; SI-NEXT: s_and_b32 s4, s97, 0xffff +; SI-NEXT: s_lshr_b64 s[6:7], vcc, 16 +; SI-NEXT: s_lshl_b32 s48, s91, 16 +; SI-NEXT: s_or_b32 s12, s4, vcc_lo +; SI-NEXT: v_writelane_b32 v31, s6, 19 +; SI-NEXT: s_and_b32 s4, s87, 0xffff +; SI-NEXT: s_mov_b32 s73, s45 +; SI-NEXT: s_mov_b32 s45, s11 +; SI-NEXT: v_writelane_b32 v31, s7, 20 +; SI-NEXT: s_or_b32 s10, s4, s48 +; SI-NEXT: s_mov_b32 s11, s49 +; SI-NEXT: s_lshr_b64 s[48:49], s[48:49], 16 +; SI-NEXT: s_mov_b32 s49, s97 +; SI-NEXT: v_readlane_b32 s97, v31, 26 +; SI-NEXT: s_lshl_b32 s50, s31, 16 +; SI-NEXT: v_readlane_b32 s77, v31, 22 +; SI-NEXT: s_and_b32 s4, s97, 0xffff +; SI-NEXT: s_or_b32 s8, s4, s50 +; SI-NEXT: s_and_b32 s4, s77, 0xffff +; SI-NEXT: s_or_b32 s6, s4, s52 +; SI-NEXT: s_and_b32 s4, s85, 0xffff +; SI-NEXT: s_mov_b32 s63, s43 +; SI-NEXT: s_mov_b32 s43, s9 +; SI-NEXT: s_mov_b32 s9, s51 +; SI-NEXT: s_lshr_b64 s[50:51], s[50:51], 16 +; SI-NEXT: s_or_b32 s4, s4, s54 +; SI-NEXT: s_mov_b32 s5, s55 +; SI-NEXT: s_lshr_b64 s[54:55], s[54:55], 16 +; SI-NEXT: s_mov_b32 s61, s47 +; SI-NEXT: s_mov_b32 s47, s13 +; SI-NEXT: s_mov_b32 s16, s29 +; SI-NEXT: s_mov_b32 s13, vcc_hi +; SI-NEXT: s_mov_b32 s23, s90 +; SI-NEXT: v_readlane_b32 s99, v31, 25 +; SI-NEXT: v_readlane_b32 s89, v31, 24 +; SI-NEXT: v_readlane_b32 s90, v31, 23 +; SI-NEXT: v_readlane_b32 s21, v31, 21 +; SI-NEXT: v_readlane_b32 s51, v31, 27 +; SI-NEXT: s_mov_b32 s7, s53 +; SI-NEXT: s_lshr_b64 s[52:53], s[52:53], 16 +; SI-NEXT: s_mov_b32 s18, s68 +; SI-NEXT: s_mov_b32 s55, s64 +; SI-NEXT: s_mov_b32 s64, s65 +; SI-NEXT: s_mov_b32 s65, s66 +; SI-NEXT: s_mov_b32 s66, s67 +; SI-NEXT: s_mov_b32 s67, s27 +; SI-NEXT: s_mov_b32 s68, s74 +; SI-NEXT: s_mov_b32 s69, s70 +; SI-NEXT: s_mov_b32 s35, s17 +; SI-NEXT: s_mov_b32 s17, s25 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s16 -; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s6 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s21 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s17 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s8, s8, 3 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s22 -; SI-NEXT: s_add_i32 s12, s12, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s18 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s8 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, s23 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s19 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s24 +; SI-NEXT: s_add_i32 s4, s85, 3 +; SI-NEXT: v_readlane_b32 s5, v31, 14 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s5, s78, 3 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s6, s86, 16 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_add_i32 s6, s77, 3 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s7, s35, 16 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s7, s96, 3 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s8, s30, 16 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_add_i32 s8, s97, 3 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s9, s31, 16 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_add_i32 s9, s39, 3 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s10, s83, 16 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_add_i32 s10, s87, 3 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s11, s91, 16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_add_i32 s11, s51, 3 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s12, s82, 16 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_add_i32 s12, s49, 3 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s13, s79, 16 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_add_i32 s13, s75, 3 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s14, s81, 16 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: v_readlane_b32 s14, v31, 1 ; SI-NEXT: s_add_i32 s14, s14, 3 -; SI-NEXT: s_add_i32 s10, s10, 3 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, s14 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s10 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, s25 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s11 -; SI-NEXT: s_add_i32 s40, s40, 3 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s40 -; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s15, s23, 16 +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: v_readlane_b32 s15, v31, 0 ; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s15 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, s27 -; SI-NEXT: s_add_i32 s42, s42, 3 -; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s41 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, s42 -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v52 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v36 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: s_add_i32 s43, s43, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s40, s99, 16 +; SI-NEXT: s_or_b32 s15, s40, s15 +; SI-NEXT: s_add_i32 s40, s16, 3 +; SI-NEXT: s_and_b32 s40, s40, 0xffff +; SI-NEXT: s_lshl_b32 s41, s89, 16 +; SI-NEXT: v_readlane_b32 s16, v31, 2 +; SI-NEXT: s_or_b32 s40, s41, s40 +; SI-NEXT: s_add_i32 s41, s16, 3 +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: s_lshl_b32 s42, s71, 16 +; SI-NEXT: s_or_b32 s41, s42, s41 +; SI-NEXT: s_add_i32 s42, s55, 3 +; SI-NEXT: s_and_b32 s42, s42, 0xffff +; SI-NEXT: s_lshl_b32 s43, s21, 16 +; SI-NEXT: s_or_b32 s42, s43, s42 +; SI-NEXT: s_add_i32 s43, s37, 3 +; SI-NEXT: s_and_b32 s43, s43, 0xffff +; SI-NEXT: s_lshl_b32 s44, s90, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_readlane_b32 s16, v31, 4 +; SI-NEXT: s_or_b32 s43, s44, s43 +; SI-NEXT: s_and_b32 s28, s28, 0xffff +; SI-NEXT: s_lshl_b32 s44, s16, 16 +; SI-NEXT: v_readlane_b32 s16, v31, 3 +; SI-NEXT: s_or_b32 s28, s44, s28 +; SI-NEXT: s_add_i32 s29, s16, 3 +; SI-NEXT: s_add_i32 s44, s28, 0x30000 +; SI-NEXT: s_and_b32 s28, s29, 0xffff +; SI-NEXT: s_lshl_b32 s29, s80, 16 +; SI-NEXT: s_or_b32 s28, s29, s28 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s26 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v35, s28 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v19, s29 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_add_i32 s45, s28, 0x30000 +; SI-NEXT: s_and_b32 s26, s26, 0xffff +; SI-NEXT: s_lshl_b32 s28, s95, 16 +; SI-NEXT: v_readlane_b32 s16, v31, 5 +; SI-NEXT: s_or_b32 s26, s28, s26 +; SI-NEXT: s_add_i32 s27, s16, 3 +; SI-NEXT: s_add_i32 s46, s26, 0x30000 +; SI-NEXT: s_and_b32 s26, s27, 0xffff +; SI-NEXT: s_lshl_b32 s27, s69, 16 +; SI-NEXT: s_or_b32 s26, s27, s26 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s47, s26, 0x30000 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_lshl_b32 s26, s38, 16 +; SI-NEXT: v_readlane_b32 s16, v31, 6 +; SI-NEXT: s_or_b32 s24, s26, s24 +; SI-NEXT: s_add_i32 s25, s16, 3 +; SI-NEXT: s_add_i32 s56, s24, 0x30000 +; SI-NEXT: s_and_b32 s24, s25, 0xffff +; SI-NEXT: s_lshl_b32 s25, s68, 16 +; SI-NEXT: v_readlane_b32 s16, v31, 13 +; SI-NEXT: s_or_b32 s24, s25, s24 +; SI-NEXT: s_add_i32 s22, s16, 3 +; SI-NEXT: s_add_i32 s57, s24, 0x30000 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_lshl_b32 s24, s36, 16 +; SI-NEXT: v_readlane_b32 s16, v31, 7 +; SI-NEXT: s_or_b32 s22, s24, s22 +; SI-NEXT: s_add_i32 s23, s16, 3 +; SI-NEXT: s_add_i32 s58, s22, 0x30000 +; SI-NEXT: s_and_b32 s22, s23, 0xffff +; SI-NEXT: s_lshl_b32 s23, s67, 16 +; SI-NEXT: v_readlane_b32 s16, v31, 12 +; SI-NEXT: s_or_b32 s22, s23, s22 +; SI-NEXT: s_add_i32 s20, s16, 3 +; SI-NEXT: v_readlane_b32 s16, v31, 9 +; SI-NEXT: s_add_i32 s59, s22, 0x30000 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_lshl_b32 s22, s16, 16 +; SI-NEXT: v_readlane_b32 s16, v31, 8 +; SI-NEXT: s_or_b32 s20, s22, s20 +; SI-NEXT: s_add_i32 s21, s16, 3 +; SI-NEXT: s_add_i32 s62, s20, 0x30000 +; SI-NEXT: s_and_b32 s20, s21, 0xffff +; SI-NEXT: s_lshl_b32 s21, s66, 16 +; SI-NEXT: v_readlane_b32 s16, v31, 11 +; SI-NEXT: s_or_b32 s20, s21, s20 +; SI-NEXT: s_add_i32 s18, s16, 3 +; SI-NEXT: s_add_i32 s63, s20, 0x30000 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s20, s84, 16 +; SI-NEXT: s_or_b32 s18, s20, s18 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s72, s18, 0x30000 +; SI-NEXT: s_and_b32 s18, s19, 0xffff +; SI-NEXT: s_lshl_b32 s19, s65, 16 +; SI-NEXT: v_readlane_b32 s16, v31, 10 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s73, s18, 0x30000 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s18, s94, 16 +; SI-NEXT: s_or_b32 s16, s18, s16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s60, s16, 0x30000 +; SI-NEXT: s_and_b32 s16, s17, 0xffff +; SI-NEXT: s_lshl_b32 s17, s64, 16 +; SI-NEXT: s_add_i32 s40, s40, 0x30000 +; SI-NEXT: s_add_i32 s41, s41, 0x30000 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s61, s16, 0x30000 +; SI-NEXT: s_lshr_b64 s[16:17], s[40:41], 16 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: v_writelane_b32 v31, s16, 15 +; SI-NEXT: v_writelane_b32 v31, s17, 16 +; SI-NEXT: s_lshr_b64 s[16:17], s[14:15], 16 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s42, s42, 0x30000 +; SI-NEXT: s_add_i32 s43, s43, 0x30000 +; SI-NEXT: v_writelane_b32 v31, s16, 17 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_lshr_b64 s[98:99], s[42:43], 16 +; SI-NEXT: v_writelane_b32 v31, s17, 18 +; SI-NEXT: s_lshr_b64 s[16:17], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[18:19], s[60:61], 16 +; SI-NEXT: s_lshr_b64 s[20:21], s[72:73], 16 +; SI-NEXT: s_lshr_b64 s[22:23], s[62:63], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[58:59], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[56:57], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[44:45], 16 +; SI-NEXT: v_writelane_b32 v31, s16, 19 +; SI-NEXT: s_lshr_b64 s[48:49], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[50:51], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[52:53], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[54:55], s[4:5], 16 +; SI-NEXT: s_lshr_b32 s64, s61, 16 +; SI-NEXT: s_lshr_b32 s65, s73, 16 +; SI-NEXT: s_lshr_b32 s66, s63, 16 +; SI-NEXT: s_lshr_b32 s67, s59, 16 +; SI-NEXT: s_lshr_b32 s68, s57, 16 +; SI-NEXT: s_lshr_b32 s69, s47, 16 +; SI-NEXT: s_lshr_b32 s80, s45, 16 +; SI-NEXT: s_lshr_b32 s90, s43, 16 +; SI-NEXT: s_lshr_b32 s71, s41, 16 +; SI-NEXT: s_lshr_b32 s99, s15, 16 +; SI-NEXT: s_lshr_b32 s81, s13, 16 +; SI-NEXT: s_lshr_b32 s82, s11, 16 +; SI-NEXT: s_lshr_b32 s83, s9, 16 +; SI-NEXT: s_lshr_b32 s30, s7, 16 +; SI-NEXT: s_lshr_b32 s86, s5, 16 +; SI-NEXT: v_writelane_b32 v31, s17, 20 ; SI-NEXT: .LBB57_3: ; %end -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v44 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v59 -; SI-NEXT: v_or_b32_e32 v17, v21, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v55 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v32 -; SI-NEXT: v_or_b32_e32 v18, v21, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v63 -; SI-NEXT: v_or_b32_e32 v19, v33, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_or_b32_e32 v20, v32, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v56 -; SI-NEXT: v_or_b32_e32 v21, v34, v21 -; SI-NEXT: v_or_b32_e32 v22, v32, v22 -; SI-NEXT: v_or_b32_e32 v23, v33, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v60 -; SI-NEXT: v_or_b32_e32 v24, v32, v24 -; SI-NEXT: v_or_b32_e32 v25, v33, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v61 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v26, v32, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v31 -; SI-NEXT: v_or_b32_e32 v27, v33, v27 -; SI-NEXT: v_or_b32_e32 v28, v30, v28 -; SI-NEXT: v_or_b32_e32 v29, v32, v29 +; SI-NEXT: s_and_b32 s16, s60, 0xffff +; SI-NEXT: s_lshl_b32 s17, s18, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s61, 0xffff +; SI-NEXT: s_lshl_b32 s18, s64, 16 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_and_b32 s18, s72, 0xffff +; SI-NEXT: s_lshl_b32 s19, s20, 16 +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: s_and_b32 s19, s73, 0xffff +; SI-NEXT: s_lshl_b32 s20, s65, 16 +; SI-NEXT: s_or_b32 s19, s19, s20 +; SI-NEXT: s_and_b32 s20, s62, 0xffff +; SI-NEXT: s_lshl_b32 s21, s22, 16 +; SI-NEXT: s_or_b32 s20, s20, s21 +; SI-NEXT: s_and_b32 s21, s63, 0xffff +; SI-NEXT: s_lshl_b32 s22, s66, 16 +; SI-NEXT: s_or_b32 s21, s21, s22 +; SI-NEXT: s_and_b32 s22, s58, 0xffff +; SI-NEXT: s_lshl_b32 s23, s34, 16 +; SI-NEXT: s_or_b32 s22, s22, s23 +; SI-NEXT: s_and_b32 s23, s59, 0xffff +; SI-NEXT: s_lshl_b32 s24, s67, 16 +; SI-NEXT: s_or_b32 s23, s23, s24 +; SI-NEXT: s_and_b32 s24, s56, 0xffff +; SI-NEXT: s_lshl_b32 s25, s76, 16 +; SI-NEXT: s_or_b32 s24, s24, s25 +; SI-NEXT: s_and_b32 s25, s57, 0xffff +; SI-NEXT: s_lshl_b32 s26, s68, 16 +; SI-NEXT: s_or_b32 s25, s25, s26 +; SI-NEXT: s_and_b32 s26, s46, 0xffff +; SI-NEXT: s_lshl_b32 s27, s88, 16 +; SI-NEXT: s_or_b32 s26, s26, s27 +; SI-NEXT: s_and_b32 s27, s47, 0xffff +; SI-NEXT: s_lshl_b32 s28, s69, 16 +; SI-NEXT: s_or_b32 s27, s27, s28 +; SI-NEXT: s_and_b32 s28, s44, 0xffff +; SI-NEXT: s_lshl_b32 s29, s92, 16 +; SI-NEXT: s_or_b32 s28, s28, s29 +; SI-NEXT: s_and_b32 s29, s45, 0xffff +; SI-NEXT: s_lshl_b32 s44, s80, 16 +; SI-NEXT: s_or_b32 s29, s29, s44 +; SI-NEXT: s_and_b32 s42, s42, 0xffff +; SI-NEXT: s_lshl_b32 s44, s98, 16 +; SI-NEXT: s_or_b32 s42, s42, s44 +; SI-NEXT: s_and_b32 s43, s43, 0xffff +; SI-NEXT: s_lshl_b32 s44, s90, 16 +; SI-NEXT: s_or_b32 s43, s43, s44 +; SI-NEXT: v_readlane_b32 s44, v31, 15 +; SI-NEXT: s_and_b32 s40, s40, 0xffff +; SI-NEXT: s_lshl_b32 s44, s44, 16 +; SI-NEXT: v_readlane_b32 s45, v31, 16 +; SI-NEXT: s_or_b32 s40, s40, s44 +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: s_lshl_b32 s44, s71, 16 +; SI-NEXT: s_or_b32 s41, s41, s44 +; SI-NEXT: v_readlane_b32 s44, v31, 17 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s44, s44, 16 +; SI-NEXT: v_readlane_b32 s45, v31, 18 +; SI-NEXT: s_or_b32 s14, s14, s44 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s44, s99, 16 +; SI-NEXT: s_or_b32 s15, s15, s44 +; SI-NEXT: v_readlane_b32 s44, v31, 19 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s44, s44, 16 +; SI-NEXT: s_or_b32 s12, s12, s44 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s44, s81, 16 +; SI-NEXT: s_or_b32 s13, s13, s44 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s44, s48, 16 +; SI-NEXT: s_or_b32 s10, s10, s44 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s44, s82, 16 +; SI-NEXT: s_or_b32 s11, s11, s44 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s44, s50, 16 +; SI-NEXT: s_or_b32 s8, s8, s44 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s44, s83, 16 +; SI-NEXT: s_or_b32 s9, s9, s44 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s44, s52, 16 +; SI-NEXT: s_or_b32 s6, s6, s44 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s44, s30, 16 +; SI-NEXT: s_or_b32 s7, s7, s44 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s44, s54, 16 +; SI-NEXT: s_or_b32 s4, s4, s44 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s44, s86, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: v_readlane_b32 s45, v31, 20 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: v_mov_b32_e32 v14, s42 +; SI-NEXT: v_mov_b32_e32 v15, s43 +; SI-NEXT: v_mov_b32_e32 v16, s40 +; SI-NEXT: v_mov_b32_e32 v17, s41 +; SI-NEXT: v_mov_b32_e32 v18, s14 +; SI-NEXT: v_mov_b32_e32 v19, s15 +; SI-NEXT: v_mov_b32_e32 v20, s12 +; SI-NEXT: v_mov_b32_e32 v21, s13 +; SI-NEXT: v_mov_b32_e32 v22, s10 +; SI-NEXT: v_mov_b32_e32 v23, s11 +; SI-NEXT: v_mov_b32_e32 v24, s8 +; SI-NEXT: v_mov_b32_e32 v25, s9 +; SI-NEXT: v_mov_b32_e32 v26, s6 +; SI-NEXT: v_mov_b32_e32 v27, s7 +; SI-NEXT: v_mov_b32_e32 v28, s4 +; SI-NEXT: v_mov_b32_e32 v29, s5 +; SI-NEXT: v_readlane_b32 s99, v30, 35 +; SI-NEXT: v_readlane_b32 s98, v30, 34 +; SI-NEXT: v_readlane_b32 s97, v30, 33 +; SI-NEXT: v_readlane_b32 s96, v30, 32 +; SI-NEXT: v_readlane_b32 s87, v30, 31 +; SI-NEXT: v_readlane_b32 s86, v30, 30 +; SI-NEXT: v_readlane_b32 s85, v30, 29 +; SI-NEXT: v_readlane_b32 s84, v30, 28 +; SI-NEXT: v_readlane_b32 s83, v30, 27 +; SI-NEXT: v_readlane_b32 s82, v30, 26 +; SI-NEXT: v_readlane_b32 s81, v30, 25 +; SI-NEXT: v_readlane_b32 s80, v30, 24 +; SI-NEXT: v_readlane_b32 s71, v30, 23 +; SI-NEXT: v_readlane_b32 s70, v30, 22 +; SI-NEXT: v_readlane_b32 s69, v30, 21 +; SI-NEXT: v_readlane_b32 s68, v30, 20 +; SI-NEXT: v_readlane_b32 s67, v30, 19 +; SI-NEXT: v_readlane_b32 s66, v30, 18 +; SI-NEXT: v_readlane_b32 s65, v30, 17 +; SI-NEXT: v_readlane_b32 s64, v30, 16 +; SI-NEXT: v_readlane_b32 s55, v30, 15 +; SI-NEXT: v_readlane_b32 s54, v30, 14 +; SI-NEXT: v_readlane_b32 s53, v30, 13 +; SI-NEXT: v_readlane_b32 s52, v30, 12 +; SI-NEXT: v_readlane_b32 s51, v30, 11 +; SI-NEXT: v_readlane_b32 s50, v30, 10 +; SI-NEXT: v_readlane_b32 s49, v30, 9 +; SI-NEXT: v_readlane_b32 s48, v30, 8 +; SI-NEXT: v_readlane_b32 s39, v30, 7 +; SI-NEXT: v_readlane_b32 s38, v30, 6 +; SI-NEXT: v_readlane_b32 s37, v30, 5 +; SI-NEXT: v_readlane_b32 s36, v30, 4 +; SI-NEXT: v_readlane_b32 s35, v30, 3 +; SI-NEXT: v_readlane_b32 s34, v30, 2 +; SI-NEXT: v_readlane_b32 s31, v30, 1 +; SI-NEXT: v_readlane_b32 s30, v30, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; kill: killed $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; kill: killed $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: v_mov_b32_e32 v53, v57 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; kill: killed $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; kill: killed $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: v_mov_b32_e32 v52, v33 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; kill: killed $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; kill: killed $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: v_mov_b32_e32 v51, v32 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; kill: killed $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; kill: killed $vgpr17 -; SI-NEXT: v_mov_b32_e32 v50, v30 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; kill: killed $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; kill: killed $vgpr17 -; SI-NEXT: v_mov_b32_e32 v49, v27 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; kill: killed $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; kill: killed $vgpr17 -; SI-NEXT: v_mov_b32_e32 v48, v26 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; kill: killed $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; kill: killed $vgpr17 -; SI-NEXT: v_mov_b32_e32 v39, v25 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; kill: killed $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; kill: killed $vgpr17 -; SI-NEXT: v_mov_b32_e32 v38, v24 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; kill: killed $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: v_mov_b32_e32 v37, v23 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; kill: killed $vgpr16 -; SI-NEXT: v_mov_b32_e32 v36, v22 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; kill: killed $vgpr16 -; SI-NEXT: v_mov_b32_e32 v35, v21 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; kill: killed $vgpr16 -; SI-NEXT: v_mov_b32_e32 v34, v20 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: v_mov_b32_e32 v33, v19 -; SI-NEXT: v_mov_b32_e32 v32, v31 -; SI-NEXT: v_mov_b32_e32 v21, v29 -; SI-NEXT: v_mov_b32_e32 v19, v28 -; SI-NEXT: ; kill: killed $vgpr16 -; SI-NEXT: ; kill: killed $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr16 -; SI-NEXT: ; kill: killed $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: s_mov_b32 s16, s29 +; SI-NEXT: v_writelane_b32 v31, s4, 15 +; SI-NEXT: v_writelane_b32 v31, s5, 16 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v31, s4, 17 +; SI-NEXT: v_writelane_b32 v31, s5, 18 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr98 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: v_writelane_b32 v31, s4, 19 +; SI-NEXT: v_writelane_b32 v31, s5, 20 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v60i16_to_v60f16_scalar: @@ -49334,265 +45634,150 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v60f16_to_v60i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v7 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v60, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v34 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v35 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v36 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v37 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v38 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v39 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v49 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v50 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v51 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v53 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v54 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v61, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v60 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 ; SI-NEXT: v_or_b32_e32 v29, v29, v60 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 ; SI-NEXT: v_or_b32_e32 v27, v27, v60 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 ; SI-NEXT: v_or_b32_e32 v25, v25, v60 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 ; SI-NEXT: v_or_b32_e32 v23, v23, v60 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 ; SI-NEXT: v_or_b32_e32 v21, v21, v60 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 ; SI-NEXT: v_or_b32_e32 v19, v19, v60 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 @@ -49601,51 +45786,51 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_or_b32_e32 v17, v17, v60 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_or_b32_e32 v15, v15, v60 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 ; SI-NEXT: v_or_b32_e32 v13, v13, v60 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43 ; SI-NEXT: v_or_b32_e32 v11, v11, v60 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v53 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_add_f32_e32 v59, 0x38000000, v59 ; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58 ; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v57 @@ -49654,68 +45839,68 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46 ; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45 ; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44 -; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43 -; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 ; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 -; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 -; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v55 -; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 ; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 ; SI-NEXT: v_or_b32_e32 v9, v9, v60 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v55 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 -; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 -; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v53 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 -; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 -; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_or_b32_e32 v7, v7, v60 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_or_b32_e32 v7, v7, v60 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_or_b32_e32 v5, v5, v60 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v42 ; SI-NEXT: v_or_b32_e32 v3, v3, v60 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v43 ; SI-NEXT: v_or_b32_e32 v1, v1, v60 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 @@ -49724,148 +45909,147 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; SI-NEXT: v_or_b32_e32 v0, v0, v59 -; SI-NEXT: v_or_b32_e32 v54, v54, v58 -; SI-NEXT: v_or_b32_e32 v53, v53, v57 -; SI-NEXT: v_or_b32_e32 v51, v51, v56 -; SI-NEXT: v_or_b32_e32 v50, v50, v47 -; SI-NEXT: v_or_b32_e32 v49, v49, v46 -; SI-NEXT: v_or_b32_e32 v39, v39, v45 -; SI-NEXT: v_or_b32_e32 v38, v38, v44 -; SI-NEXT: v_or_b32_e32 v37, v37, v43 -; SI-NEXT: v_or_b32_e32 v36, v36, v42 -; SI-NEXT: v_or_b32_e32 v35, v35, v41 -; SI-NEXT: v_or_b32_e32 v34, v34, v40 -; SI-NEXT: v_or_b32_e32 v32, v32, v55 -; SI-NEXT: v_or_b32_e32 v33, v33, v52 -; SI-NEXT: v_or_b32_e32 v31, v31, v48 -; SI-NEXT: v_alignbit_b32 v59, v1, v59, 16 -; SI-NEXT: v_alignbit_b32 v58, v3, v58, 16 -; SI-NEXT: v_alignbit_b32 v57, v5, v57, 16 -; SI-NEXT: v_alignbit_b32 v56, v7, v56, 16 -; SI-NEXT: v_alignbit_b32 v47, v9, v47, 16 -; SI-NEXT: v_alignbit_b32 v46, v11, v46, 16 -; SI-NEXT: v_alignbit_b32 v45, v13, v45, 16 -; SI-NEXT: v_alignbit_b32 v44, v15, v44, 16 -; SI-NEXT: v_alignbit_b32 v43, v17, v43, 16 -; SI-NEXT: v_alignbit_b32 v42, v19, v42, 16 -; SI-NEXT: v_alignbit_b32 v41, v21, v41, 16 -; SI-NEXT: v_alignbit_b32 v40, v23, v40, 16 -; SI-NEXT: v_alignbit_b32 v55, v25, v55, 16 -; SI-NEXT: v_alignbit_b32 v52, v27, v52, 16 -; SI-NEXT: v_alignbit_b32 v48, v29, v48, 16 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_or_b32_e32 v0, v0, v30 +; SI-NEXT: v_or_b32_e32 v2, v2, v59 +; SI-NEXT: v_or_b32_e32 v4, v4, v58 +; SI-NEXT: v_or_b32_e32 v6, v6, v57 +; SI-NEXT: v_or_b32_e32 v8, v8, v56 +; SI-NEXT: v_or_b32_e32 v10, v10, v47 +; SI-NEXT: v_or_b32_e32 v12, v12, v46 +; SI-NEXT: v_or_b32_e32 v14, v14, v45 +; SI-NEXT: v_or_b32_e32 v16, v16, v44 +; SI-NEXT: v_or_b32_e32 v18, v18, v41 +; SI-NEXT: v_or_b32_e32 v20, v20, v54 +; SI-NEXT: v_or_b32_e32 v22, v22, v51 +; SI-NEXT: v_or_b32_e32 v24, v24, v48 +; SI-NEXT: v_or_b32_e32 v26, v26, v37 +; SI-NEXT: v_or_b32_e32 v28, v28, v35 +; SI-NEXT: v_alignbit_b32 v30, v1, v30, 16 +; SI-NEXT: v_alignbit_b32 v59, v3, v59, 16 +; SI-NEXT: v_alignbit_b32 v58, v5, v58, 16 +; SI-NEXT: v_alignbit_b32 v57, v7, v57, 16 +; SI-NEXT: v_alignbit_b32 v56, v9, v56, 16 +; SI-NEXT: v_alignbit_b32 v47, v11, v47, 16 +; SI-NEXT: v_alignbit_b32 v46, v13, v46, 16 +; SI-NEXT: v_alignbit_b32 v45, v15, v45, 16 +; SI-NEXT: v_alignbit_b32 v44, v17, v44, 16 +; SI-NEXT: v_alignbit_b32 v41, v19, v41, 16 +; SI-NEXT: v_alignbit_b32 v54, v21, v54, 16 +; SI-NEXT: v_alignbit_b32 v51, v23, v51, 16 +; SI-NEXT: v_alignbit_b32 v48, v25, v48, 16 +; SI-NEXT: v_alignbit_b32 v37, v27, v37, 16 +; SI-NEXT: v_alignbit_b32 v35, v29, v35, 16 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v0, v0, v30 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v30 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v59 +; SI-NEXT: v_or_b32_e32 v2, v2, v30 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v42 +; SI-NEXT: v_or_b32_e32 v3, v3, v30 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v58 +; SI-NEXT: v_or_b32_e32 v4, v4, v30 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v40 +; SI-NEXT: v_or_b32_e32 v5, v5, v30 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v57 +; SI-NEXT: v_or_b32_e32 v6, v6, v30 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v55 +; SI-NEXT: v_or_b32_e32 v7, v7, v30 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v56 +; SI-NEXT: v_or_b32_e32 v8, v8, v30 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v53 +; SI-NEXT: v_or_b32_e32 v9, v9, v30 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v47 +; SI-NEXT: v_or_b32_e32 v10, v10, v30 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v52 +; SI-NEXT: v_or_b32_e32 v11, v11, v30 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v46 +; SI-NEXT: v_or_b32_e32 v12, v12, v30 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v50 +; SI-NEXT: v_or_b32_e32 v13, v13, v30 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v45 +; SI-NEXT: v_or_b32_e32 v14, v14, v30 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v49 +; SI-NEXT: v_or_b32_e32 v15, v15, v30 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v44 +; SI-NEXT: v_or_b32_e32 v16, v16, v30 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v39 +; SI-NEXT: v_or_b32_e32 v17, v17, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v41 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v30 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v38 +; SI-NEXT: v_or_b32_e32 v19, v19, v30 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v54 +; SI-NEXT: v_or_b32_e32 v20, v20, v30 ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v0, v0, v59 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v58 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v53 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v57 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v51 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v56 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v50 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v47 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v49 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v46 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v39 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v45 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v38 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v44 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v43 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v36 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v42 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v35 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v41 -; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v34 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v40 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v36 +; SI-NEXT: v_or_b32_e32 v21, v21, v30 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v51 +; SI-NEXT: v_or_b32_e32 v22, v22, v30 ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v34 +; SI-NEXT: v_or_b32_e32 v23, v23, v30 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v48 +; SI-NEXT: v_or_b32_e32 v24, v24, v30 ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v33 +; SI-NEXT: v_or_b32_e32 v25, v25, v30 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v37 +; SI-NEXT: v_or_b32_e32 v26, v26, v30 ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v24, v24, v32 -; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v33 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v52 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v31 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v32 +; SI-NEXT: v_or_b32_e32 v27, v27, v30 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v35 +; SI-NEXT: v_or_b32_e32 v28, v28, v30 ; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_or_b32_e32 v2, v2, v54 -; SI-NEXT: v_or_b32_e32 v4, v4, v53 -; SI-NEXT: v_or_b32_e32 v6, v6, v51 -; SI-NEXT: v_or_b32_e32 v8, v8, v50 -; SI-NEXT: v_or_b32_e32 v10, v10, v49 -; SI-NEXT: v_or_b32_e32 v12, v12, v39 -; SI-NEXT: v_or_b32_e32 v14, v14, v38 -; SI-NEXT: v_or_b32_e32 v16, v16, v37 -; SI-NEXT: v_or_b32_e32 v18, v18, v36 -; SI-NEXT: v_or_b32_e32 v20, v20, v35 -; SI-NEXT: v_or_b32_e32 v22, v22, v34 -; SI-NEXT: v_or_b32_e32 v26, v26, v32 -; SI-NEXT: v_or_b32_e32 v28, v28, v31 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v31 ; SI-NEXT: v_or_b32_e32 v29, v29, v30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -50477,77 +46661,6 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-LABEL: bitcast_v60f16_to_v60i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s16 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 -; SI-NEXT: s_lshr_b32 s10, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s10 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 -; SI-NEXT: s_lshr_b32 s40, s19, 16 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s40 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 -; SI-NEXT: s_lshr_b32 s12, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v5 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 -; SI-NEXT: s_lshr_b32 s8, s27, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: s_lshr_b32 s13, s22, 16 -; SI-NEXT: s_lshr_b32 s15, s20, 16 -; SI-NEXT: s_lshr_b32 s41, s18, 16 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -50564,594 +46677,555 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v43, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s41 -; SI-NEXT: s_lshr_b32 s11, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s11 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: s_lshr_b32 s42, s17, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, s42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: s_lshr_b32 s14, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s22 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v62, v15 +; SI-NEXT: v_mov_b32_e32 v23, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v62 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, s26 -; SI-NEXT: s_lshr_b32 s6, s29, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: s_lshr_b32 s7, s28, 16 -; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v13 +; SI-NEXT: v_mov_b32_e32 v28, v10 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v15, v9 +; SI-NEXT: v_mov_b32_e32 v6, v5 +; SI-NEXT: v_mov_b32_e32 v37, v3 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: s_lshr_b32 s8, s29, 16 +; SI-NEXT: s_lshr_b32 s6, s28, 16 +; SI-NEXT: s_lshr_b32 s10, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: s_lshr_b32 s11, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s40, s21, 16 +; SI-NEXT: s_lshr_b32 s41, s20, 16 +; SI-NEXT: s_lshr_b32 s15, s19, 16 +; SI-NEXT: s_lshr_b32 s42, s18, 16 +; SI-NEXT: s_lshr_b32 s14, s17, 16 ; SI-NEXT: s_lshr_b32 s43, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s6 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v26 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v14 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB59_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB59_3 -; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v2, v46 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_branch .LBB59_3 +; SI-NEXT: .LBB59_2: +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: .LBB59_3: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_vccnz .LBB59_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v4, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s19 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s13 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s14 +; SI-NEXT: v_or_b32_e32 v56, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s21 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v4 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v30, v2, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s12 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v13 +; SI-NEXT: v_or_b32_e32 v49, v4, v8 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s23 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s11 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v6 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s25 +; SI-NEXT: v_or_b32_e32 v40, v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s9 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v31 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 +; SI-NEXT: v_or_b32_e32 v53, v2, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s10 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s27 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v35, v2, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s29 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v41 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_or_b32_e32 v46, v2, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v47 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_or_b32_e32 v38, v5, v6 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v54 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v10 -; SI-NEXT: v_mov_b32_e32 v21, v40 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v33 -; SI-NEXT: v_mov_b32_e32 v23, v38 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v22 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v63 -; SI-NEXT: v_mov_b32_e32 v54, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v43 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s16 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v63 +; SI-NEXT: v_or_b32_e32 v5, v7, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v43 -; SI-NEXT: v_mov_b32_e32 v58, v35 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v39 -; SI-NEXT: v_mov_b32_e32 v35, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v56 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v26 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v18, s22 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v42 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v10, v18, v39 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v50 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 +; SI-NEXT: v_or_b32_e32 v43, v1, v2 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v42, v19, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v24 -; SI-NEXT: v_or_b32_e32 v19, v21, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v27, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v19, v27, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s26 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v50, v23, v42 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v22, v16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s20 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v6 +; SI-NEXT: v_or_b32_e32 v25, v9, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v22 +; SI-NEXT: v_or_b32_e32 v59, v11, v16 +; SI-NEXT: v_or_b32_e32 v16, v12, v29 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_mov_b32_e32 v8, v55 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v17 +; SI-NEXT: v_or_b32_e32 v7, v14, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v6, v12, v8 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v14 +; SI-NEXT: v_mov_b32_e32 v9, v56 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s28 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v1 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v1, v21, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v41 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v7, v5 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v22 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v36 -; SI-NEXT: v_lshr_b64 v[50:51], v[16:17], 16 -; SI-NEXT: v_mov_b32_e32 v51, v46 -; SI-NEXT: v_mov_b32_e32 v46, v44 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v20 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v20 -; SI-NEXT: v_or_b32_e32 v21, v23, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v37 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v62 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v15 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v61, v12, v13 +; SI-NEXT: v_or_b32_e32 v12, v20, v4 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v44, v26, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v27, v28 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v22 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v37, v53 -; SI-NEXT: v_or_b32_e32 v23, v25, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v32 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v54 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_or_b32_e32 v53, v37, v2 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v31 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; SI-NEXT: v_or_b32_e32 v56, v15, v14 +; SI-NEXT: v_mov_b32_e32 v15, v9 +; SI-NEXT: v_or_b32_e32 v51, v22, v2 +; SI-NEXT: v_mov_b32_e32 v14, v8 +; SI-NEXT: v_or_b32_e32 v11, v27, v58 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v8, v15 +; SI-NEXT: v_or_b32_e32 v13, v17, v45 +; SI-NEXT: v_mov_b32_e32 v17, v43 +; SI-NEXT: v_lshr_b64 v[42:43], v[42:43], 16 +; SI-NEXT: v_mov_b32_e32 v9, v0 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v20 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v18 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v31 -; SI-NEXT: v_or_b32_e32 v63, v29, v36 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v38, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v35 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[31:32], v[20:21], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[26:27], 16 -; SI-NEXT: v_mov_b32_e32 v33, v61 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_mov_b32_e32 v61, v34 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v28 -; SI-NEXT: v_lshr_b64 v[28:29], v[10:11], 16 -; SI-NEXT: v_or_b32_e32 v49, v38, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v58 -; SI-NEXT: v_lshr_b64 v[58:59], v[0:1], 16 -; SI-NEXT: v_lshr_b64 v[59:60], v[14:15], 16 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_or_b32_e32 v35, v38, v14 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_or_b32_e32 v36, v36, v0 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v36, v24 -; SI-NEXT: v_or_b32_e32 v24, v37, v4 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v37, v55 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v24, v36, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v52 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: v_or_b32_e32 v55, v37, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v48 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[24:25], v[2:3], 16 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_or_b32_e32 v52, v36, v12 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v36, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v48, v37, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v54 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[24:25], v[4:5], 16 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[24:25], v[6:7], 16 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[24:25], v[8:9], 16 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v45 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[24:25], v[18:19], 16 -; SI-NEXT: v_lshr_b64 v[44:45], v[62:63], 16 -; SI-NEXT: v_mov_b32_e32 v25, v23 -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v38, v30 -; SI-NEXT: v_or_b32_e32 v30, v36, v41 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v30, v38, v18 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v36, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_or_b32_e32 v36, v36, v22 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v38, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v36, v38, v26 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v30 -; SI-NEXT: v_or_b32_e32 v30, v37, v20 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_or_b32_e32 v18, v26, v24 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v33 +; SI-NEXT: v_or_b32_e32 v20, v28, v60 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v36, v39, v62 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[20:21], v[48:49], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[4:5], 16 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[36:37], v[41:42], 16 -; SI-NEXT: v_lshr_b64 v[39:40], v[12:13], 16 -; SI-NEXT: v_lshr_b64 v[37:38], v[22:23], 16 -; SI-NEXT: v_mov_b32_e32 v41, v50 -; SI-NEXT: v_mov_b32_e32 v50, v59 -; SI-NEXT: v_mov_b32_e32 v40, v19 -; SI-NEXT: v_mov_b32_e32 v38, v21 -; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 +; SI-NEXT: v_or_b32_e32 v18, v54, v55 +; SI-NEXT: v_lshr_b64 v[26:27], v[29:30], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[24:25], 16 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v53 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v49 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v55 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v52 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v35 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v24 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v40 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v31 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v38 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v37 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v63 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mov_b32_e32 v18, v30 +; SI-NEXT: v_lshr_b64 v[32:33], v[52:53], 16 +; SI-NEXT: v_mov_b32_e32 v23, v50 +; SI-NEXT: v_lshr_b64 v[29:30], v[58:59], 16 +; SI-NEXT: v_mov_b32_e32 v54, v51 +; SI-NEXT: v_lshr_b64 v[50:51], v[55:56], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[33:34], v[2:3], 16 +; SI-NEXT: v_mov_b32_e32 v2, v16 +; SI-NEXT: v_mov_b32_e32 v16, v31 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v15, v3 +; SI-NEXT: v_mov_b32_e32 v3, v19 +; SI-NEXT: v_mov_b32_e32 v19, v38 +; SI-NEXT: v_lshr_b64 v[37:38], v[37:38], 16 +; SI-NEXT: v_mov_b32_e32 v27, v63 +; SI-NEXT: v_lshr_b64 v[62:63], v[39:40], 16 +; SI-NEXT: v_mov_b32_e32 v39, v47 +; SI-NEXT: v_lshr_b64 v[47:48], v[45:46], 16 +; SI-NEXT: v_mov_b32_e32 v5, v25 +; SI-NEXT: v_mov_b32_e32 v25, v13 +; SI-NEXT: v_mov_b32_e32 v13, v44 +; SI-NEXT: v_lshr_b64 v[43:44], v[60:61], 16 +; SI-NEXT: v_mov_b32_e32 v52, v12 +; SI-NEXT: v_mov_b32_e32 v34, v37 +; SI-NEXT: s_branch .LBB59_6 +; SI-NEXT: .LBB59_5: +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v41, s8 +; SI-NEXT: v_mov_b32_e32 v55, s10 +; SI-NEXT: v_mov_b32_e32 v16, s11 +; SI-NEXT: v_mov_b32_e32 v39, s12 +; SI-NEXT: v_mov_b32_e32 v30, s40 +; SI-NEXT: v_mov_b32_e32 v36, s15 +; SI-NEXT: v_mov_b32_e32 v31, s14 +; SI-NEXT: v_mov_b32_e32 v18, s17 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v8, s19 +; SI-NEXT: v_mov_b32_e32 v49, s21 +; SI-NEXT: v_mov_b32_e32 v40, s23 +; SI-NEXT: v_mov_b32_e32 v53, s25 +; SI-NEXT: v_mov_b32_e32 v35, s27 +; SI-NEXT: v_mov_b32_e32 v46, s29 +; SI-NEXT: v_mov_b32_e32 v29, v12 +; SI-NEXT: v_mov_b32_e32 v25, s28 +; SI-NEXT: v_mov_b32_e32 v43, v21 +; SI-NEXT: v_mov_b32_e32 v34, v26 +; SI-NEXT: v_mov_b32_e32 v59, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, s26 +; SI-NEXT: v_mov_b32_e32 v3, s24 +; SI-NEXT: v_mov_b32_e32 v10, s22 +; SI-NEXT: v_mov_b32_e32 v5, v15 +; SI-NEXT: v_mov_b32_e32 v19, v6 +; SI-NEXT: v_mov_b32_e32 v17, v1 +; SI-NEXT: v_mov_b32_e32 v15, v37 +; SI-NEXT: v_mov_b32_e32 v1, s20 +; SI-NEXT: v_mov_b32_e32 v6, s18 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v21, v9 +; SI-NEXT: v_mov_b32_e32 v26, s43 +; SI-NEXT: v_mov_b32_e32 v11, v28 +; SI-NEXT: v_mov_b32_e32 v28, s42 +; SI-NEXT: v_mov_b32_e32 v52, v20 +; SI-NEXT: v_mov_b32_e32 v20, s41 +; SI-NEXT: v_mov_b32_e32 v56, v62 +; SI-NEXT: v_mov_b32_e32 v62, s13 +; SI-NEXT: v_mov_b32_e32 v32, s9 +; SI-NEXT: v_mov_b32_e32 v14, s7 +; SI-NEXT: v_mov_b32_e32 v47, s6 +; SI-NEXT: v_mov_b32_e32 v9, v24 +; SI-NEXT: .LBB59_6: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v31 +; SI-NEXT: v_or_b32_e32 v31, v2, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v6 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v51 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 +; SI-NEXT: v_or_b32_e32 v48, v4, v6 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v20 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v47 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v30 +; SI-NEXT: v_or_b32_e32 v38, v6, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v62 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v10 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v34 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v42 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v18 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v39 +; SI-NEXT: v_or_b32_e32 v30, v8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v54 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v28 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v16 +; SI-NEXT: v_or_b32_e32 v37, v10, v12 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v13 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v43 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v55 +; SI-NEXT: v_or_b32_e32 v39, v12, v14 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v47 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v25 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v61 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v41 +; SI-NEXT: v_or_b32_e32 v13, v14, v16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v42 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v23 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v57 +; SI-NEXT: v_or_b32_e32 v35, v1, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v33 +; SI-NEXT: v_or_b32_e32 v16, v1, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v51 +; SI-NEXT: v_or_b32_e32 v17, v1, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_mov_b32_e32 v15, v35 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v16, v16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v18, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 +; SI-NEXT: v_or_b32_e32 v19, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; SI-NEXT: v_or_b32_e32 v20, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v27 +; SI-NEXT: v_or_b32_e32 v21, v1, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; SI-NEXT: v_mov_b32_e32 v7, v30 +; SI-NEXT: v_mov_b32_e32 v9, v37 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v22, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_mov_b32_e32 v5, v38 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v18, v18, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v57 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v23, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 +; SI-NEXT: v_or_b32_e32 v24, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v59 +; SI-NEXT: v_mov_b32_e32 v11, v39 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_or_b32_e32 v20, v20, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v33 -; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v22, v22, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v25, v1, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v43 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: v_or_b32_e32 v24, v24, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v56 -; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v26, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v27, v1, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v28, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v56 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -51168,24 +47242,13 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; SI-NEXT: v_or_b32_e32 v26, v26, v28 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; SI-NEXT: v_or_b32_e32 v28, v28, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v29, v1, v3 +; SI-NEXT: v_mov_b32_e32 v1, v31 +; SI-NEXT: v_mov_b32_e32 v3, v48 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB59_4: -; SI-NEXT: s_branch .LBB59_2 ; ; VI-LABEL: bitcast_v60f16_to_v60i16_scalar: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll index ccc46cc5df39e..d44ffdfbc547c 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll @@ -2611,58 +2611,35 @@ define <6 x half> @bitcast_v3i32_to_v6f16(<3 x i32> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB12_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: .LBB12_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v3, v0, v2, 16 +; SI-NEXT: v_alignbit_b32 v4, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB12_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v0 -; SI-NEXT: .LBB12_4: ; %end +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_alignbit_b32 v4, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v3, v0, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_or_b32_e32 v1, v5, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3i32_to_v6f16: @@ -2734,50 +2711,35 @@ define inreg <6 x half> @bitcast_v3i32_to_v6f16_scalar(<3 x i32> inreg %a, i32 i ; SI-NEXT: s_cmp_lg_u32 s19, 0 ; SI-NEXT: s_cbranch_scc0 .LBB13_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s16 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB13_3 ; SI-NEXT: .LBB13_2: ; %cmp.true ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_lshr_b32 s4, s18, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b64 s[6:7], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 16 ; SI-NEXT: .LBB13_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v0, v3, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v6, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s10, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s18, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s7, s4 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mov_b32_e32 v2, s4 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB13_4: -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: s_branch .LBB13_2 ; ; VI-LABEL: bitcast_v3i32_to_v6f16_scalar: @@ -2859,21 +2821,12 @@ define <3 x i32> @bitcast_v6f16_to_v3i32(<6 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v6f16_to_v3i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_mov_b32_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -2886,35 +2839,38 @@ define <3 x i32> @bitcast_v6f16_to_v3i32(<6 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB14_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_or_b32_e32 v0, v8, v0 -; SI-NEXT: v_or_b32_e32 v1, v6, v1 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_2 ; SI-NEXT: .LBB14_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v6 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -3003,61 +2959,56 @@ define inreg <3 x i32> @bitcast_v6f16_to_v3i32_scalar(<6 x half> inreg %a, i32 i ; SI-LABEL: bitcast_v6f16_to_v3i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v9 +; SI-NEXT: s_lshr_b32 s7, s18, 16 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: s_lshr_b32 s11, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s19, 0 -; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: s_cbranch_scc0 .LBB15_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_or_b32_e32 v0, v7, v0 -; SI-NEXT: v_or_b32_e32 v1, v5, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: s_cbranch_execnz .LBB15_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s10, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s8, s7, 16 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: s_cbranch_execnz .LBB15_4 ; SI-NEXT: .LBB15_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s7 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v4, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB15_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; SI-NEXT: .LBB15_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6 ; SI-NEXT: s_branch .LBB15_2 +; SI-NEXT: .LBB15_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f16_to_v3i32_scalar: ; VI: ; %bb.0: @@ -5905,58 +5856,35 @@ define <6 x half> @bitcast_v3f32_to_v6f16(<3 x float> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB28_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: .LBB28_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v3, v0, v2, 16 +; SI-NEXT: v_alignbit_b32 v4, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB28_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v0 -; SI-NEXT: .LBB28_4: ; %end +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_alignbit_b32 v4, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v3, v0, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_or_b32_e32 v1, v5, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3f32_to_v6f16: @@ -6025,53 +5953,43 @@ define inreg <6 x half> @bitcast_v3f32_to_v6f16_scalar(<3 x float> inreg %a, i32 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s19, 0 -; SI-NEXT: s_cbranch_scc0 .LBB29_4 +; SI-NEXT: s_cbranch_scc0 .LBB29_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s16 -; SI-NEXT: s_cbranch_execnz .LBB29_3 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[16:17], 16 +; SI-NEXT: s_cbranch_execnz .LBB29_4 ; SI-NEXT: .LBB29_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v5, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v4, s17, 1.0 ; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v5 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: .LBB29_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v0, v3, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v6, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB29_4: -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_lshr_b64 v[5:6], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[3:4], v[2:3], 16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: s_branch .LBB29_5 +; SI-NEXT: .LBB29_3: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: s_branch .LBB29_2 +; SI-NEXT: .LBB29_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v4, s10 +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: v_mov_b32_e32 v5, s6 +; SI-NEXT: .LBB29_5: ; %end +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3f32_to_v6f16_scalar: ; VI: ; %bb.0: @@ -6156,21 +6074,12 @@ define <3 x float> @bitcast_v6f16_to_v3f32(<6 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v6f16_to_v3f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_mov_b32_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -6183,35 +6092,38 @@ define <3 x float> @bitcast_v6f16_to_v3f32(<6 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB30_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_or_b32_e32 v0, v8, v0 -; SI-NEXT: v_or_b32_e32 v1, v6, v1 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB30_2 ; SI-NEXT: .LBB30_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v6 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -6300,61 +6212,56 @@ define inreg <3 x float> @bitcast_v6f16_to_v3f32_scalar(<6 x half> inreg %a, i32 ; SI-LABEL: bitcast_v6f16_to_v3f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v9 +; SI-NEXT: s_lshr_b32 s7, s18, 16 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: s_lshr_b32 s11, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s19, 0 -; SI-NEXT: s_cbranch_scc0 .LBB31_4 +; SI-NEXT: s_cbranch_scc0 .LBB31_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_or_b32_e32 v0, v7, v0 -; SI-NEXT: v_or_b32_e32 v1, v5, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: s_cbranch_execnz .LBB31_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s10, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s8, s7, 16 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: s_cbranch_execnz .LBB31_4 ; SI-NEXT: .LBB31_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s7 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v4, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: .LBB31_3: ; %end +; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB31_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; SI-NEXT: .LBB31_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6 ; SI-NEXT: s_branch .LBB31_2 +; SI-NEXT: .LBB31_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f16_to_v3f32_scalar: ; VI: ; %bb.0: @@ -8829,101 +8736,111 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v3 -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v11 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB40_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v15 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v4, v12, v3 +; SI-NEXT: v_or_b32_e32 v3, v1, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v0, v0, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v0, v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v8 -; SI-NEXT: v_or_b32_e32 v0, v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_or_b32_e32 v1, v16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_alignbit_b32 v7, v3, v1, 16 +; SI-NEXT: v_or_b32_e32 v1, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NEXT: v_or_b32_e32 v11, v0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: .LBB40_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB40_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v17, v1 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x300, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_or_b32_e32 v1, v16, v1 -; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v15, v1 -; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v14, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v16, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v6 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v0, v15, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v2, v12, v2 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v1, v13, v1 -; SI-NEXT: v_or_b32_e32 v0, v12, v0 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v10 +; SI-NEXT: v_or_b32_e32 v0, v9, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 +; SI-NEXT: v_alignbit_b32 v7, v3, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v11 ; SI-NEXT: .LBB40_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i8_to_v6f16: @@ -9274,86 +9191,98 @@ define inreg <6 x half> @bitcast_v12i8_to_v6f16_scalar(<12 x i8> inreg %a, i32 i ; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_and_b32 s4, s18, 0xff -; SI-NEXT: s_lshl_b32 s5, s19, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: s_lshl_b32 s5, s21, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_and_b32 s4, s22, 0xff -; SI-NEXT: s_lshl_b32 s5, s23, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_and_b32 s4, s24, 0xff -; SI-NEXT: s_lshl_b32 s5, s25, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_and_b32 s4, s26, 0xff -; SI-NEXT: s_lshl_b32 s5, s27, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_cbranch_execnz .LBB41_3 -; SI-NEXT: .LBB41_2: ; %cmp.true -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_and_b32 s4, s26, 0xff -; SI-NEXT: s_lshl_b32 s5, s27, 8 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_or_b32 s12, s6, s5 ; SI-NEXT: s_and_b32 s5, s24, 0xff ; SI-NEXT: s_lshl_b32 s6, s25, 8 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s26, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s8, s7, s6 +; SI-NEXT: s_or_b32 s10, s5, s8 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s22, 0xff -; SI-NEXT: s_lshl_b32 s7, s23, 8 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s20, 0xff -; SI-NEXT: s_lshl_b32 s8, s21, 8 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s18, 0xff -; SI-NEXT: s_lshl_b32 s9, s19, 8 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s9, s7, s6 +; SI-NEXT: s_or_b32 s13, s5, s9 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshr_b64 s[6:7], s[12:13], 16 +; SI-NEXT: s_or_b32 s4, s4, s12 +; SI-NEXT: s_lshr_b32 s7, s9, 16 +; SI-NEXT: s_lshr_b32 s11, s8, 16 +; SI-NEXT: s_mov_b32 s5, s13 +; SI-NEXT: s_cbranch_execnz .LBB41_3 +; SI-NEXT: .LBB41_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s16, 0xff -; SI-NEXT: s_lshl_b32 s10, s17, 8 -; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff ; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 -; SI-NEXT: s_addk_i32 s7, 0x300 -; SI-NEXT: s_addk_i32 s8, 0x300 -; SI-NEXT: s_addk_i32 s9, 0x300 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s10, s6, 0x3000000 +; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16 +; SI-NEXT: s_lshr_b32 s7, s5, 16 +; SI-NEXT: s_lshr_b32 s11, s10, 16 ; SI-NEXT: .LBB41_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s6, s7, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s10, 0xffff +; SI-NEXT: s_lshl_b32 s7, s11, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB41_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: s_branch .LBB41_2 ; ; VI-LABEL: bitcast_v12i8_to_v6f16_scalar: @@ -9626,21 +9555,12 @@ define <12 x i8> @bitcast_v6f16_to_v12i8(<6 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v6f16_to_v12i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v2 +; SI-NEXT: v_mov_b32_e32 v12, v2 +; SI-NEXT: v_mov_b32_e32 v13, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -9662,12 +9582,15 @@ define <12 x i8> @bitcast_v6f16_to_v12i8(<6 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB42_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 -; SI-NEXT: v_or_b32_e32 v0, v14, v0 -; SI-NEXT: v_or_b32_e32 v4, v13, v1 -; SI-NEXT: v_or_b32_e32 v8, v12, v7 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v15 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; SI-NEXT: v_or_b32_e32 v4, v1, v2 +; SI-NEXT: v_or_b32_e32 v8, v7, v8 ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 ; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 @@ -9949,61 +9872,52 @@ define inreg <12 x i8> @bitcast_v6f16_to_v12i8_scalar(<6 x half> inreg %a, i32 i ; SI-LABEL: bitcast_v6f16_to_v12i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: s_lshr_b32 s14, s18, 16 +; SI-NEXT: s_lshr_b32 s15, s17, 16 +; SI-NEXT: s_lshr_b32 s20, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s19, 0 -; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: s_cbranch_scc0 .LBB43_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; SI-NEXT: v_or_b32_e32 v12, v15, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_or_b32_e32 v13, v14, v1 -; SI-NEXT: v_lshr_b64 v[1:2], v[12:13], 8 -; SI-NEXT: v_lshr_b64 v[3:4], v[12:13], 24 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; SI-NEXT: v_lshr_b64 v[4:5], v[12:13], 16 -; SI-NEXT: v_or_b32_e32 v8, v0, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v13 -; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 -; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 -; SI-NEXT: v_bfe_u32 v11, v10, 8, 8 -; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s20, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s15, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 8 +; SI-NEXT: s_and_b32 s9, s18, 0xffff +; SI-NEXT: s_lshl_b32 s11, s14, 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 +; SI-NEXT: s_or_b32 s11, s9, s11 +; SI-NEXT: s_lshr_b32 s7, s5, 8 +; SI-NEXT: s_lshr_b32 s9, s11, 8 +; SI-NEXT: s_bfe_u32 s19, s15, 0x80008 +; SI-NEXT: s_bfe_u32 s21, s14, 0x80008 +; SI-NEXT: s_cbranch_execnz .LBB43_4 ; SI-NEXT: .LBB43_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v12, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v12, v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_or_b32_e32 v13, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_or_b32_e32 v13, v1, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 ; SI-NEXT: v_lshr_b64 v[3:4], v[12:13], 24 ; SI-NEXT: v_or_b32_e32 v8, v0, v1 @@ -10013,22 +9927,36 @@ define inreg <12 x i8> @bitcast_v6f16_to_v12i8_scalar(<6 x half> inreg %a, i32 i ; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 ; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 ; SI-NEXT: v_bfe_u32 v11, v10, 8, 8 -; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: s_branch .LBB43_5 +; SI-NEXT: .LBB43_3: +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr21 +; SI-NEXT: s_branch .LBB43_2 +; SI-NEXT: .LBB43_4: +; SI-NEXT: v_mov_b32_e32 v10, s14 +; SI-NEXT: v_mov_b32_e32 v6, s15 +; SI-NEXT: v_mov_b32_e32 v11, s21 +; SI-NEXT: v_mov_b32_e32 v7, s19 +; SI-NEXT: v_mov_b32_e32 v8, s11 +; SI-NEXT: v_mov_b32_e32 v13, s5 +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: v_mov_b32_e32 v9, s9 +; SI-NEXT: v_mov_b32_e32 v5, s7 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: .LBB43_5: ; %end ; SI-NEXT: v_mov_b32_e32 v0, v12 ; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v4, v13 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB43_4: -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: s_branch .LBB43_2 ; ; VI-LABEL: bitcast_v6f16_to_v12i8_scalar: ; VI: ; %bb.0: @@ -11651,84 +11579,72 @@ define <6 x half> @bitcast_v6bf16_to_v6f16(<6 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v2 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB48_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_alignbit_b32 v4, v1, v7, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 +; SI-NEXT: v_alignbit_b32 v5, v4, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16 +; SI-NEXT: v_alignbit_b32 v3, v2, v9, 16 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: .LBB48_2: ; %Flow +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB48_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v4, v1, v4, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_alignbit_b32 v5, v4, v5, 16 ; SI-NEXT: .LBB48_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v5 +; SI-NEXT: v_or_b32_e32 v1, v4, v1 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -12027,73 +11943,60 @@ define inreg <6 x half> @bitcast_v6bf16_to_v6f16_scalar(<6 x bfloat> inreg %a, i ; SI-NEXT: s_and_b32 s8, s16, 0xffff0000 ; SI-NEXT: s_lshl_b32 s9, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s19, 0 -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s9 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s8 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s7 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s6 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s5 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s6 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s7 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s9 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s5 ; SI-NEXT: s_cbranch_scc0 .LBB49_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; SI-NEXT: v_lshr_b64 v[1:2], v[5:6], 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v16 +; SI-NEXT: v_lshr_b64 v[7:8], v[10:11], 16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v15 +; SI-NEXT: v_lshr_b64 v[12:13], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[8:9], v[3:4], 16 ; SI-NEXT: s_cbranch_execnz .LBB49_3 ; SI-NEXT: .LBB49_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_lshr_b64 v[7:8], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_lshr_b64 v[1:2], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 +; SI-NEXT: v_lshr_b64 v[8:9], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[12:13], v[0:1], 16 ; SI-NEXT: .LBB49_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_branch .LBB49_2 ; ; VI-LABEL: bitcast_v6bf16_to_v6f16_scalar: @@ -12431,85 +12334,73 @@ define <6 x bfloat> @bitcast_v6f16_to_v6bf16(<6 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v6f16_to_v6bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v4 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB50_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v6 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: .LBB50_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_alignbit_b32 v0, v3, v0, 16 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v4 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f16_to_v6bf16: @@ -12587,78 +12478,74 @@ define inreg <6 x bfloat> @bitcast_v6f16_to_v6bf16_scalar(<6 x half> inreg %a, i ; SI-LABEL: bitcast_v6f16_to_v6bf16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v2 +; SI-NEXT: s_lshr_b32 s8, s18, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s19, 0 -; SI-NEXT: s_cbranch_scc0 .LBB51_4 +; SI-NEXT: s_cbranch_scc0 .LBB51_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; SI-NEXT: s_cbranch_execnz .LBB51_3 +; SI-NEXT: s_lshl_b32 s9, s16, 16 +; SI-NEXT: s_lshl_b32 s10, s6, 16 +; SI-NEXT: s_lshl_b32 s11, s17, 16 +; SI-NEXT: s_lshl_b32 s12, s7, 16 +; SI-NEXT: s_lshl_b32 s13, s18, 16 +; SI-NEXT: s_lshl_b32 s14, s8, 16 +; SI-NEXT: s_cbranch_execnz .LBB51_4 ; SI-NEXT: .LBB51_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s16 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v2 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: .LBB51_3: ; %end -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v9 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_branch .LBB51_5 +; SI-NEXT: .LBB51_3: +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: s_branch .LBB51_2 +; SI-NEXT: .LBB51_4: +; SI-NEXT: v_mov_b32_e32 v4, s14 +; SI-NEXT: v_mov_b32_e32 v3, s13 +; SI-NEXT: v_mov_b32_e32 v5, s12 +; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: v_mov_b32_e32 v0, s9 +; SI-NEXT: .LBB51_5: ; %end +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; SI-NEXT: v_lshr_b64 v[1:2], v[4:5], 16 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v2 +; SI-NEXT: v_lshr_b64 v[1:2], v[5:6], 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v4 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_lshr_b64 v[2:3], v[3:4], 16 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB51_4: -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: s_branch .LBB51_2 ; ; VI-LABEL: bitcast_v6f16_to_v6bf16_scalar: ; VI: ; %bb.0: @@ -13769,62 +13656,50 @@ define <6 x i16> @bitcast_v6f16_to_v6i16(<6 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v6f16_to_v6i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_or_b32_e32 v2, v2, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 ; SI-NEXT: v_or_b32_e32 v0, v0, v3 -; SI-NEXT: v_alignbit_b32 v6, v2, v3, 16 +; SI-NEXT: v_alignbit_b32 v6, v1, v3, 16 ; SI-NEXT: .LBB56_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v3 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -13903,64 +13778,60 @@ define inreg <6 x i16> @bitcast_v6f16_to_v6i16_scalar(<6 x half> inreg %a, i32 i ; SI-LABEL: bitcast_v6f16_to_v6i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 +; SI-NEXT: s_lshr_b32 s7, s18, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s8, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s19, 0 -; SI-NEXT: s_cbranch_scc0 .LBB57_4 +; SI-NEXT: s_cbranch_scc0 .LBB57_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB57_3 +; SI-NEXT: s_cbranch_execnz .LBB57_4 ; SI-NEXT: .LBB57_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v6, v2, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v5, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v2 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_lshr_b64 v[2:3], v[0:1], 16 ; SI-NEXT: v_or_b32_e32 v3, v8, v7 -; SI-NEXT: .LBB57_3: ; %end -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: s_branch .LBB57_5 +; SI-NEXT: .LBB57_3: +; SI-NEXT: s_branch .LBB57_2 +; SI-NEXT: .LBB57_4: +; SI-NEXT: v_mov_b32_e32 v4, s7 +; SI-NEXT: v_mov_b32_e32 v6, s6 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s18 +; SI-NEXT: v_mov_b32_e32 v5, s16 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: .LBB57_5: ; %end +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB57_4: -; SI-NEXT: s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v6f16_to_v6i16_scalar: ; VI: ; %bb.0: @@ -14059,62 +13930,63 @@ define <6 x half> @bitcast_v6i16_to_v6f16(<6 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v6i16_to_v6f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v4 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v6, v1, v11 +; SI-NEXT: v_or_b32_e32 v3, v0, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_alignbit_b32 v7, v6, v10, 16 +; SI-NEXT: v_or_b32_e32 v8, v0, v9 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: .LBB58_2: ; %Flow +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v11 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v10, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v9, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_alignbit_b32 v7, v6, v3, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v8 ; SI-NEXT: .LBB58_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6i16_to_v6f16: @@ -14191,53 +14063,61 @@ define inreg <6 x half> @bitcast_v6i16_to_v6f16_scalar(<6 x i16> inreg %a, i32 i ; SI-LABEL: bitcast_v6i16_to_v6f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_lshr_b32 s10, s18, 16 +; SI-NEXT: s_lshr_b32 s11, s17, 16 +; SI-NEXT: s_lshr_b32 s13, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s19, 0 ; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s10, 16 +; SI-NEXT: s_or_b32 s12, s5, s6 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s11, 16 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s14, s13, 16 +; SI-NEXT: s_or_b32 s15, s5, s6 +; SI-NEXT: s_or_b32 s4, s4, s14 +; SI-NEXT: s_lshr_b64 s[6:7], s[14:15], 16 +; SI-NEXT: s_mov_b32 s5, s15 ; SI-NEXT: s_cbranch_execnz .LBB59_3 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: s_add_i32 s8, s8, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s11, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s10, 16 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s12, s6, 0x30000 +; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16 +; SI-NEXT: s_lshr_b32 s11, s5, 16 +; SI-NEXT: s_lshr_b32 s10, s12, 16 ; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v0, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s6, s11, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s12, 0xffff +; SI-NEXT: s_lshl_b32 s7, s10, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: s_branch .LBB59_2 ; ; VI-LABEL: bitcast_v6i16_to_v6f16_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 12cb8d2f6fb51..f52a33c7c0f8d 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -11918,20 +11918,18 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrspace(1) %uniform.ptr, half %val) { ; GFX7LESS-LABEL: uniform_fadd_f16: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_load_dword s6, s[4:5], 0xd ; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7LESS-NEXT: s_load_dword s6, s[4:5], 0xd ; GFX7LESS-NEXT: s_mov_b64 s[8:9], 0 ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v0, s6 ; GFX7LESS-NEXT: s_and_b32 s4, s2, -4 ; GFX7LESS-NEXT: s_mov_b32 s5, s3 ; GFX7LESS-NEXT: s_and_b32 s2, s2, 3 -; GFX7LESS-NEXT: s_lshl_b32 s10, s2, 3 ; GFX7LESS-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX7LESS-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v4, s6 +; GFX7LESS-NEXT: s_lshl_b32 s10, s2, 3 ; GFX7LESS-NEXT: s_lshl_b32 s2, 0xffff, s10 -; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v4, v0 ; GFX7LESS-NEXT: s_not_b32 s2, s2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3 @@ -13197,60 +13195,52 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addrspace(1) %uniform.ptr, <2 x half> %val) { ; GFX7LESS-LABEL: uniform_fadd_v2f16: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_load_dword s6, s[4:5], 0xd ; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7LESS-NEXT: s_load_dword s4, s[4:5], 0xd ; GFX7LESS-NEXT: s_mov_b64 s[8:9], 0 -; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_lshr_b32 s4, s6, 16 -; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v0, s6 ; GFX7LESS-NEXT: s_load_dword s5, s[2:3], 0x0 -; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v1, s4 -; GFX7LESS-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7LESS-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 +; GFX7LESS-NEXT: s_lshr_b32 s6, s4, 16 +; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v2, s4 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v2, s5 ; GFX7LESS-NEXT: s_lshr_b32 s4, s5, 16 -; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v3, s4 +; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v3, s6 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s5 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s4 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: s_mov_b32 s4, s2 ; GFX7LESS-NEXT: s_mov_b32 s5, s3 ; GFX7LESS-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7LESS-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX7LESS-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7LESS-NEXT: v_add_f32_e32 v4, v4, v1 -; GFX7LESS-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX7LESS-NEXT: v_or_b32_e32 v3, v2, v3 -; GFX7LESS-NEXT: v_cvt_f16_f32_e32 v2, v4 -; GFX7LESS-NEXT: v_cvt_f16_f32_e32 v4, v5 -; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7LESS-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[4:5], off, s[4:7], 0 glc +; GFX7LESS-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7LESS-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7LESS-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX7LESS-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX7LESS-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7LESS-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7LESS-NEXT: v_or_b32_e32 v4, v0, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v4 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v5 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX7LESS-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v2, v4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX7LESS-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB20_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: v_cvt_f16_f32_e32 v0, v3 -; GFX7LESS-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7LESS-NEXT: v_cvt_f16_f32_e32 v1, v2 -; GFX7LESS-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7LESS-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll index 2761cba5ea71b..02d7b50e23b5d 100644 --- a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll @@ -20,20 +20,18 @@ define amdgpu_kernel void @br_cc_f16( ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1 -; SI-NEXT: s_cbranch_vccnz .LBB0_2 -; SI-NEXT: ; %bb.1: ; %one -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_branch .LBB0_3 -; SI-NEXT: .LBB0_2: ; %two -; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-NEXT: .LBB0_3: ; %one ; SI-NEXT: s_mov_b32 s2, s6 ; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v2, v3 +; SI-NEXT: s_cbranch_vccnz .LBB0_2 +; SI-NEXT: ; %bb.1: ; %one ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm +; SI-NEXT: .LBB0_2: ; %two +; SI-NEXT: buffer_store_short v1, off, s[0:3], 0 +; SI-NEXT: s_endpgm ; ; VI-LABEL: br_cc_f16: ; VI: ; %bb.0: ; %entry @@ -145,20 +143,15 @@ define amdgpu_kernel void @br_cc_f16_imm_a( ; SI-NEXT: s_mov_b32 s4, s2 ; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cmp_nlt_f32_e32 vcc, 0.5, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cmp_nlt_f32_e32 vcc, 0.5, v1 ; SI-NEXT: s_cbranch_vccnz .LBB1_2 ; SI-NEXT: ; %bb.1: ; %one -; SI-NEXT: s_mov_b32 s2, s6 -; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: v_mov_b32_e32 v0, 0x3800 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 -; SI-NEXT: s_endpgm ; SI-NEXT: .LBB1_2: ; %two -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_mov_b32 s2, s6 -; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -249,19 +242,16 @@ define amdgpu_kernel void @br_cc_f16_imm_b( ; SI-NEXT: s_mov_b32 s4, s2 ; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0.5, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0.5, v1 ; SI-NEXT: s_cbranch_vccnz .LBB2_2 ; SI-NEXT: ; %bb.1: ; %one -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_mov_b32 s2, s6 -; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB2_2: ; %two -; SI-NEXT: s_mov_b32 s2, s6 -; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: v_mov_b32_e32 v0, 0x3800 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index d8ef44361c40d..27308e82a3354 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -3672,14 +3672,12 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-NEXT: s_and_b32 s4, s20, -4 ; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -3712,14 +3710,12 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-NEXT: s_and_b32 s4, s20, -4 ; GFX6-NEXT: v_mov_b32_e32 v4, s4 ; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -4100,14 +4096,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX7-NEXT: s_and_b32 s4, s20, -4 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -4139,14 +4133,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX6-NEXT: s_and_b32 s4, s20, -4 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -4832,12 +4824,11 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX7-NEXT: v_and_b32_e32 v8, -4, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v8 -; GFX7-NEXT: v_not_b32_e32 v10, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 +; GFX7-NEXT: v_not_b32_e32 v9, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec ; GFX7-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -4848,28 +4839,27 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v5, v9, s[8:11], 0 offen +; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB15_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v4 ; GFX7-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v8, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_and_b32_e32 v6, v5, v10 +; GFX7-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX7-NEXT: s_mov_b64 s[12:13], exec -; GFX7-NEXT: v_add_f32_e32 v4, v4, v11 +; GFX7-NEXT: v_add_f32_e32 v4, v4, v10 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v8, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v4 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: v_mov_b32_e32 v5, v6 ; GFX7-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -4881,33 +4871,32 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB15_4 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v5, v6 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB15_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v8, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX6-NEXT: v_and_b32_e32 v8, -4, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v8 -; GFX6-NEXT: v_not_b32_e32 v10, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 +; GFX6-NEXT: v_not_b32_e32 v9, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 @@ -4918,29 +4907,27 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: buffer_load_dword v5, v9, s[8:11], 0 offen +; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB15_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v5 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v4 ; GFX6-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 ; GFX6-NEXT: ; Child Loop BB15_4 Depth 2 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v5 +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, v5, v10 +; GFX6-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX6-NEXT: s_mov_b64 s[12:13], exec -; GFX6-NEXT: v_add_f32_e32 v4, v4, v11 +; GFX6-NEXT: v_add_f32_e32 v4, v4, v10 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v8, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v4 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: v_mov_b32_e32 v5, v6 ; GFX6-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 @@ -4952,21 +4939,21 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB15_4 ; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX6-NEXT: v_mov_b32_e32 v5, v6 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_cbranch_execnz .LBB15_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v8, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 @@ -7039,50 +7026,40 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_mov_b32_e32 v1, s20 ; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: v_mov_b32_e32 v4, s20 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen offset:1024 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v2, v6 +; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -7091,53 +7068,42 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: v_mov_b32_e32 v1, s20 ; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v1, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: v_mov_b32_e32 v2, v6 +; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -7276,42 +7242,33 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX7-NEXT: v_mov_b32_e32 v1, s20 ; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: v_mov_b32_e32 v4, s20 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen offset:1024 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v2, v6 +; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7324,49 +7281,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX6-NEXT: v_mov_b32_e32 v1, s20 ; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v1, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: v_mov_b32_e32 v2, v6 +; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB20_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -7703,9 +7650,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], exec ; GFX7-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -7716,38 +7661,33 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB21_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v8 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_add_f32_e32 v8, v8, v9 +; GFX7-NEXT: v_add_f32_e32 v10, v10, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v8 +; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GFX7-NEXT: v_or_b32_e32 v8, v6, v7 ; GFX7-NEXT: s_mov_b64 s[12:13], exec -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v5, v6 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v9 -; GFX7-NEXT: v_add_f32_e32 v8, v8, v10 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v5, v8, v5 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; GFX7-NEXT: v_or_b32_e32 v7, v10, v6 +; GFX7-NEXT: v_mov_b32_e32 v6, v7 +; GFX7-NEXT: v_mov_b32_e32 v7, v8 ; GFX7-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -7759,34 +7699,29 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen offset:1024 glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB21_4 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB21_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v8 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v5 ; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 @@ -7798,36 +7733,32 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX6-NEXT: ; implicit-def: $vgpr4 ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB21_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v9, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v9, v7 ; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v5 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 ; GFX6-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v7 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: s_mov_b64 s[12:13], exec -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v5, v4, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v9 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v6 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v9 ; GFX6-NEXT: v_add_f32_e32 v7, v7, v10 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v11, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v5, v6, v4 +; GFX6-NEXT: s_mov_b64 s[12:13], exec +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v11 ; GFX6-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 @@ -7847,21 +7778,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX6-NEXT: s_cbranch_execnz .LBB21_4 ; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_cbranch_execnz .LBB21_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v7 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -8034,50 +7963,40 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX7-NEXT: v_mov_b32_e32 v1, s20 ; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: v_mov_b32_e32 v4, s20 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen offset:1024 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v2, v6 +; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB22_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: @@ -8086,53 +8005,42 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX6-NEXT: v_mov_b32_e32 v1, s20 ; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v1, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: v_mov_b32_e32 v2, v6 +; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB22_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst @@ -8301,42 +8209,33 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX7-NEXT: v_mov_b32_e32 v1, s20 ; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: v_mov_b32_e32 v4, s20 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen offset:1024 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v2, v6 +; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB23_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8349,49 +8248,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX6-NEXT: v_mov_b32_e32 v1, s20 ; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v1, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: v_mov_b32_e32 v2, v6 +; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB23_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst @@ -8564,50 +8453,40 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_mov_b32_e32 v1, s20 ; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: v_mov_b32_e32 v4, s20 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen offset:1024 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v2, v6 +; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB24_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: @@ -8616,53 +8495,42 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: v_mov_b32_e32 v1, s20 ; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v1, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: v_mov_b32_e32 v2, v6 +; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB24_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 @@ -8831,42 +8699,33 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX7-NEXT: v_mov_b32_e32 v1, s20 ; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: v_mov_b32_e32 v4, s20 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen offset:1024 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v2, v6 +; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB25_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8879,49 +8738,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX6-NEXT: v_mov_b32_e32 v1, s20 ; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v1, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: v_mov_b32_e32 v2, v6 +; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB25_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index fc3ed6d332211..5b5fb8f3a1663 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -2773,14 +2773,12 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-NEXT: s_and_b32 s4, s20, -4 ; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -2813,14 +2811,12 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-NEXT: s_and_b32 s4, s20, -4 ; GFX6-NEXT: v_mov_b32_e32 v4, s4 ; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -3222,14 +3218,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX7-NEXT: s_and_b32 s4, s20, -4 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -3261,14 +3255,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX6-NEXT: s_and_b32 s4, s20, -4 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -3976,12 +3968,11 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX7-NEXT: v_and_b32_e32 v8, -4, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v8 -; GFX7-NEXT: v_not_b32_e32 v10, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 +; GFX7-NEXT: v_not_b32_e32 v9, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec ; GFX7-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -3992,28 +3983,27 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v5, v9, s[8:11], 0 offen +; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB12_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v4 ; GFX7-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v8, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_and_b32_e32 v6, v5, v10 +; GFX7-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX7-NEXT: s_mov_b64 s[12:13], exec -; GFX7-NEXT: v_max_f32_e32 v4, v4, v11 +; GFX7-NEXT: v_max_f32_e32 v4, v4, v10 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v8, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v4 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: v_mov_b32_e32 v5, v6 ; GFX7-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -4025,33 +4015,32 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB12_4 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v5, v6 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB12_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v8, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX6-NEXT: v_and_b32_e32 v8, -4, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v8 -; GFX6-NEXT: v_not_b32_e32 v10, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 +; GFX6-NEXT: v_not_b32_e32 v9, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 @@ -4062,29 +4051,27 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: buffer_load_dword v5, v9, s[8:11], 0 offen +; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB12_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v5 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v4 ; GFX6-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 ; GFX6-NEXT: ; Child Loop BB12_4 Depth 2 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v5 +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, v5, v10 +; GFX6-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX6-NEXT: s_mov_b64 s[12:13], exec -; GFX6-NEXT: v_max_f32_e32 v4, v4, v11 +; GFX6-NEXT: v_max_f32_e32 v4, v4, v10 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v8, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v4 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: v_mov_b32_e32 v5, v6 ; GFX6-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 @@ -4096,21 +4083,21 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB12_4 ; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX6-NEXT: v_mov_b32_e32 v5, v6 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_cbranch_execnz .LBB12_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v8, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 @@ -6259,50 +6246,40 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_mov_b32_e32 v1, s20 ; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: v_mov_b32_e32 v4, s20 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen offset:1024 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v2, v6 +; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -6311,53 +6288,42 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: v_mov_b32_e32 v1, s20 ; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_max_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v1, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: v_mov_b32_e32 v2, v6 +; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -6575,42 +6541,33 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX7-NEXT: v_mov_b32_e32 v1, s20 ; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: v_mov_b32_e32 v4, s20 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen offset:1024 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v2, v6 +; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6623,49 +6580,39 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX6-NEXT: v_mov_b32_e32 v1, s20 ; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_max_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v1, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: v_mov_b32_e32 v2, v6 +; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 %unused = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -7111,9 +7058,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], exec ; GFX7-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -7124,38 +7069,33 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB18_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v8 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_max_f32_e32 v8, v8, v9 +; GFX7-NEXT: v_max_f32_e32 v10, v10, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v8 +; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GFX7-NEXT: v_or_b32_e32 v8, v6, v7 ; GFX7-NEXT: s_mov_b64 s[12:13], exec -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v5, v6 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v9 -; GFX7-NEXT: v_max_f32_e32 v8, v8, v10 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v5, v8, v5 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; GFX7-NEXT: v_or_b32_e32 v7, v10, v6 +; GFX7-NEXT: v_mov_b32_e32 v6, v7 +; GFX7-NEXT: v_mov_b32_e32 v7, v8 ; GFX7-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -7167,34 +7107,29 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen offset:1024 glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB18_4 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB18_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v8 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v5 ; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 @@ -7206,36 +7141,32 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX6-NEXT: ; implicit-def: $vgpr4 ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB18_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v9, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v9, v7 ; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v5 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 ; GFX6-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v7 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: s_mov_b64 s[12:13], exec -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v5, v4, v5 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v9 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v6 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_max_f32_e32 v5, v5, v9 ; GFX6-NEXT: v_max_f32_e32 v7, v7, v10 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v11, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v5, v6, v4 +; GFX6-NEXT: s_mov_b64 s[12:13], exec +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v11 ; GFX6-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 @@ -7255,21 +7186,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX6-NEXT: s_cbranch_execnz .LBB18_4 ; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_cbranch_execnz .LBB18_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v7 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index 8f270f9a466e2..c1c512b9c0a18 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -2773,14 +2773,12 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-NEXT: s_and_b32 s4, s20, -4 ; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -2813,14 +2811,12 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-NEXT: s_and_b32 s4, s20, -4 ; GFX6-NEXT: v_mov_b32_e32 v4, s4 ; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -3222,14 +3218,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX7-NEXT: s_and_b32 s4, s20, -4 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -3261,14 +3255,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX6-NEXT: s_and_b32 s4, s20, -4 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -3976,12 +3968,11 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX7-NEXT: v_and_b32_e32 v8, -4, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v8 -; GFX7-NEXT: v_not_b32_e32 v10, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 +; GFX7-NEXT: v_not_b32_e32 v9, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec ; GFX7-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -3992,28 +3983,27 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v5, v9, s[8:11], 0 offen +; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB12_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v4 ; GFX7-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v8, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_and_b32_e32 v6, v5, v10 +; GFX7-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX7-NEXT: s_mov_b64 s[12:13], exec -; GFX7-NEXT: v_min_f32_e32 v4, v4, v11 +; GFX7-NEXT: v_min_f32_e32 v4, v4, v10 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v8, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v4 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: v_mov_b32_e32 v5, v6 ; GFX7-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -4025,33 +4015,32 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB12_4 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v5, v6 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB12_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v8, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX6-NEXT: v_and_b32_e32 v8, -4, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v8 -; GFX6-NEXT: v_not_b32_e32 v10, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 +; GFX6-NEXT: v_not_b32_e32 v9, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 @@ -4062,29 +4051,27 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: buffer_load_dword v5, v9, s[8:11], 0 offen +; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB12_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v5 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v4 ; GFX6-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 ; GFX6-NEXT: ; Child Loop BB12_4 Depth 2 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v5 +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, v5, v10 +; GFX6-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX6-NEXT: s_mov_b64 s[12:13], exec -; GFX6-NEXT: v_min_f32_e32 v4, v4, v11 +; GFX6-NEXT: v_min_f32_e32 v4, v4, v10 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v8, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v4 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: v_mov_b32_e32 v5, v6 ; GFX6-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 @@ -4096,21 +4083,21 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB12_4 ; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX6-NEXT: v_mov_b32_e32 v5, v6 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_cbranch_execnz .LBB12_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v8, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 @@ -6259,50 +6246,40 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_mov_b32_e32 v1, s20 ; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: v_mov_b32_e32 v4, s20 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen offset:1024 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v2, v6 +; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -6311,53 +6288,42 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: v_mov_b32_e32 v1, s20 ; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_min_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v1, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: v_mov_b32_e32 v2, v6 +; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -6575,42 +6541,33 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX7-NEXT: v_mov_b32_e32 v1, s20 ; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: v_mov_b32_e32 v4, s20 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen offset:1024 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v2, v6 +; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6623,49 +6580,39 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX6-NEXT: v_mov_b32_e32 v1, s20 ; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_min_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v1, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: v_mov_b32_e32 v2, v6 +; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 %unused = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -7111,9 +7058,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], exec ; GFX7-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -7124,38 +7069,33 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB18_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v8 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_min_f32_e32 v8, v8, v9 +; GFX7-NEXT: v_min_f32_e32 v10, v10, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v8 +; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GFX7-NEXT: v_or_b32_e32 v8, v6, v7 ; GFX7-NEXT: s_mov_b64 s[12:13], exec -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v5, v6 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v9 -; GFX7-NEXT: v_min_f32_e32 v8, v8, v10 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v5, v8, v5 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; GFX7-NEXT: v_or_b32_e32 v7, v10, v6 +; GFX7-NEXT: v_mov_b32_e32 v6, v7 +; GFX7-NEXT: v_mov_b32_e32 v7, v8 ; GFX7-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -7167,34 +7107,29 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen offset:1024 glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB18_4 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB18_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v8 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v5 ; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 @@ -7206,36 +7141,32 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX6-NEXT: ; implicit-def: $vgpr4 ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB18_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v9, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v9, v7 ; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v5 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 ; GFX6-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v7 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: s_mov_b64 s[12:13], exec -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v5, v4, v5 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v9 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v6 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_min_f32_e32 v5, v5, v9 ; GFX6-NEXT: v_min_f32_e32 v7, v7, v10 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v11, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v5, v6, v4 +; GFX6-NEXT: s_mov_b64 s[12:13], exec +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v11 ; GFX6-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 @@ -7255,21 +7186,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX6-NEXT: s_cbranch_execnz .LBB18_4 ; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_cbranch_execnz .LBB18_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v7 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll index 689f9d7d59550..95f7744d94882 100644 --- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll +++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll @@ -786,20 +786,20 @@ define amdgpu_ps <4 x half> @ps_mesa_v4f16(<4 x half> %arg0) { ; SI: ; %bb.0: ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: ; return to shader part epilog @@ -837,22 +837,22 @@ define amdgpu_ps <4 x half> @ps_mesa_inreg_v4f16(<4 x half> inreg %arg0) { ; SI: ; %bb.0: ; SI-NEXT: s_lshr_b32 s2, s0, 16 ; SI-NEXT: s_lshr_b32 s3, s1, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s3 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s3 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v4, v2 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: ps_mesa_inreg_v4f16: diff --git a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll index b5e0d3aeace32..638e4b01488a0 100644 --- a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll +++ b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll @@ -450,7 +450,13 @@ define amdgpu_kernel void @v_clamp_add_src_f16_denorm(ptr addrspace(1) %out, ptr ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_add_f32_e64 v2, v2, 1.0 clamp +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_max_f32_e32 v2, 0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_min_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm @@ -533,7 +539,13 @@ define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(ptr addrspace(1) %ou ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_add_f32_e64 v2, v2, 1.0 clamp +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_max_f32_e32 v2, 0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_min_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm @@ -839,9 +851,21 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm(ptr addrspace(1) %out, p ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_add_f32_e64 v3, v3, 1.0 clamp -; SI-NEXT: v_add_f32_e64 v2, v2, 1.0 clamp +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_max_f32_e32 v3, 0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_max_f32_e32 v2, 0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_min_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_min_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -918,9 +942,21 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_no_denormals(ptr addrspace(1) % ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_add_f32_e64 v3, v3, 1.0 clamp -; SI-NEXT: v_add_f32_e64 v2, v2, 1.0 clamp +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_max_f32_e32 v3, 0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_max_f32_e32 v2, 0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_min_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_min_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -998,16 +1034,24 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg(ptr addrspace(1) %ou ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp -; SI-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_max_f32_e32 v3, 0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_max_f32_e32 v2, 0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_min_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_min_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -1087,12 +1131,23 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_lo(ptr addrspace(1) ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_add_f32_e64 v3, v3, 1.0 clamp +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e64 v2, -v2 +; SI-NEXT: v_max_f32_e32 v3, 0, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_max_f32_e64 v2, -v2, -v2 clamp +; SI-NEXT: v_max_f32_e32 v2, 0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_min_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_min_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -1176,9 +1231,20 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_hi(ptr addrspace(1) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_max_f32_e64 v3, -v3, -v3 clamp -; SI-NEXT: v_add_f32_e64 v2, v2, 1.0 clamp ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e64 v3, -v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_max_f32_e32 v3, 0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_max_f32_e32 v2, 0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_min_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_min_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -1261,9 +1327,21 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_shuf(ptr addrspace(1) %o ; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_add_f32_e64 v3, v3, 1.0 clamp +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e64 v2, v2, 1.0 clamp +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_max_f32_e32 v3, 0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_max_f32_e32 v2, 0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_min_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_min_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -1343,9 +1421,17 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f32_src(ptr addrspace(1) %ou ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp -; SI-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_max_f32_e32 v3, 0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_max_f32_e32 v2, 0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_min_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_min_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -1509,11 +1595,18 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %ou ; SI-NEXT: buffer_load_ushort v1, v[1:2], s[4:7], 0 addr64 ; SI-NEXT: v_cvt_f16_f32_e32 v3, 0 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_min_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e64 v1, v1 clamp +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_max_f32_e32 v1, 0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_min_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 @@ -1607,9 +1700,17 @@ define <2 x half> @v_clamp_cvt_pkrtz_src_v2f16_denorm(float %a, float %b) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v0, v0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e64 v1, v1 clamp -; SI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_max_f32_e32 v1, 0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_max_f32_e32 v0, 0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_min_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_min_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1933,7 +2034,13 @@ define amdgpu_kernel void @v_clamp_add_src_f16_denorm_minimumnum_maximumnum(ptr ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_add_f32_e64 v2, v2, 1.0 clamp +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_max_f32_e32 v2, 0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_min_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm @@ -2016,7 +2123,13 @@ define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals_minimumnum_maximumnu ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_add_f32_e64 v2, v2, 1.0 clamp +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_max_f32_e32 v2, 0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_min_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm @@ -2173,9 +2286,21 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_minimumnum_maximumnum(pt ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_add_f32_e64 v3, v3, 1.0 clamp -; SI-NEXT: v_add_f32_e64 v2, v2, 1.0 clamp +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_max_f32_e32 v3, 0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_max_f32_e32 v2, 0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_min_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_min_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll index 711e2f2951fae..0d3567faaa10c 100644 --- a/llvm/test/CodeGen/AMDGPU/clamp.ll +++ b/llvm/test/CodeGen/AMDGPU/clamp.ll @@ -554,7 +554,11 @@ define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX6-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_max_f32_e32 v2, 0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm @@ -662,7 +666,11 @@ define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX6-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e64 v2, -v2 clamp +; GFX6-NEXT: v_cvt_f32_f16_e64 v2, -v2 +; GFX6-NEXT: v_max_f32_e32 v2, 0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm @@ -771,7 +779,11 @@ define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspa ; GFX6-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e64 v2, -|v2| clamp +; GFX6-NEXT: v_cvt_f32_f16_e64 v2, -|v2| +; GFX6-NEXT: v_max_f32_e32 v2, 0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm @@ -2980,9 +2992,17 @@ define amdgpu_kernel void @v_clamp_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp -; GFX6-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_max_f32_e32 v3, 0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_max_f32_e32 v2, 0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 @@ -3066,14 +3086,18 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_max_f32_e32 v3, 0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_max_f32_e32 v2, 0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_max_f32_e32 v2, 0x7fc00000, v2 -; GFX6-NEXT: v_med3_f32 v3, v3, 0, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_min_f32_e32 v3, 0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -3167,9 +3191,15 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp +; GFX6-NEXT: v_max_f32_e32 v3, 0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_max_f32_e32 v2, 2.0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -3268,10 +3298,17 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_max_f32_e32 v3, 0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_max_f32_e32 v2, 0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_med3_f32 v2, v2, 0, 0 +; GFX6-NEXT: v_min_f32_e32 v2, 0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 @@ -3369,9 +3406,17 @@ define amdgpu_kernel void @v_clamp_neg_v2f16(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp -; GFX6-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_max_f32_e32 v3, 0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_max_f32_e32 v2, 0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 @@ -3460,9 +3505,17 @@ define amdgpu_kernel void @v_clamp_negabs_v2f16(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_or_b32_e32 v2, 0x80008000, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp -; GFX6-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_max_f32_e32 v3, 0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_max_f32_e32 v2, 0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 @@ -3555,10 +3608,17 @@ define amdgpu_kernel void @v_clamp_neglo_v2f16(ptr addrspace(1) %out, ptr addrsp ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX6-NEXT: v_cvt_f32_f16_e64 v2, -v2 clamp +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e64 v2, -v2 +; GFX6-NEXT: v_max_f32_e32 v3, 0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_max_f32_e32 v2, 0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 @@ -3648,9 +3708,17 @@ define amdgpu_kernel void @v_clamp_neghi_v2f16(ptr addrspace(1) %out, ptr addrsp ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e64 v3, -v3 clamp -; GFX6-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp +; GFX6-NEXT: v_cvt_f32_f16_e64 v3, -v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_max_f32_e32 v3, 0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_max_f32_e32 v2, 0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 @@ -3739,13 +3807,21 @@ define amdgpu_kernel void @v_clamp_v2f16_shuffle(ptr addrspace(1) %out, ptr addr ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp -; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_max_f32_e32 v3, 0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_max_f32_e32 v2, 0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm ; @@ -3827,16 +3903,20 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_max_f32_e32 v3, 0x7fc00000, v3 +; GFX6-NEXT: v_max_f32_e32 v3, 0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_max_f32_e32 v2, 0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_med3_f32 v2, v2, 0, v4 +; GFX6-NEXT: v_min_f32_e32 v2, 0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 @@ -3925,14 +4005,18 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_max_f32_e32 v3, 0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_max_f32_e32 v2, 0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_max_f32_e32 v2, 0x7fc00000, v2 -; GFX6-NEXT: v_med3_f32 v3, v3, 0, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_min_f32_e32 v3, 0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -4352,7 +4436,11 @@ define half @v_clamp_f16_minimumnum_maximumnum(half %a) #1 { ; GFX6-LABEL: v_clamp_f16_minimumnum_maximumnum: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_max_f32_e32 v0, 0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_min_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -4408,7 +4496,11 @@ define half @v_clamp_f16_minimumnum_maximumnum_no_ieee(half %a) #5 { ; GFX6-LABEL: v_clamp_f16_minimumnum_maximumnum_no_ieee: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_max_f32_e32 v0, 0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_min_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -4466,7 +4558,13 @@ define half @v_clamp_f16_minimumnum_maximumnum_foldable_source(half %a, half %b) ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_add_f32_e64 v0, v0, v1 clamp +; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_max_f32_e32 v0, 0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_min_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -4525,7 +4623,13 @@ define half @v_clamp_f16_minimumnum_maximumnum_no_ieee_foldable_source(half %a, ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_add_f32_e64 v0, v0, v1 clamp +; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_max_f32_e32 v0, 0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_min_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll index c48efc925ea8b..11edef2929d7d 100644 --- a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll +++ b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll @@ -1108,13 +1108,13 @@ define amdgpu_vs <3 x half> @load_v3i16(ptr addrspace(6) inreg %p0, ptr addrspac ; GFX67-NEXT: s_waitcnt lgkmcnt(0) ; GFX67-NEXT: s_lshr_b32 s4, s2, 16 ; GFX67-NEXT: s_lshr_b32 s5, s0, 16 -; GFX67-NEXT: s_add_i32 s0, s0, s2 ; GFX67-NEXT: s_add_i32 s5, s5, s4 +; GFX67-NEXT: s_add_i32 s0, s0, s2 ; GFX67-NEXT: s_add_i32 s1, s1, s3 +; GFX67-NEXT: s_lshl_b32 s3, s5, 16 ; GFX67-NEXT: s_and_b32 s0, s0, 0xffff -; GFX67-NEXT: s_lshl_b32 s2, s5, 16 -; GFX67-NEXT: s_or_b32 s0, s0, s2 ; GFX67-NEXT: s_and_b32 s1, s1, 0xffff +; GFX67-NEXT: s_or_b32 s0, s0, s3 ; GFX67-NEXT: v_mov_b32_e32 v0, s0 ; GFX67-NEXT: v_mov_b32_e32 v1, s1 ; GFX67-NEXT: ; return to shader part epilog @@ -1170,19 +1170,19 @@ define amdgpu_vs <4 x half> @load_v4i16(ptr addrspace(6) inreg %p0, ptr addrspac ; GFX67-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x4 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) ; GFX67-NEXT: s_lshr_b32 s4, s0, 16 -; GFX67-NEXT: s_lshr_b32 s6, s2, 16 ; GFX67-NEXT: s_lshr_b32 s5, s1, 16 +; GFX67-NEXT: s_lshr_b32 s6, s2, 16 ; GFX67-NEXT: s_lshr_b32 s7, s3, 16 -; GFX67-NEXT: s_add_i32 s0, s0, s2 -; GFX67-NEXT: s_add_i32 s4, s4, s6 -; GFX67-NEXT: s_add_i32 s1, s1, s3 ; GFX67-NEXT: s_add_i32 s5, s5, s7 +; GFX67-NEXT: s_add_i32 s1, s1, s3 +; GFX67-NEXT: s_add_i32 s4, s4, s6 +; GFX67-NEXT: s_add_i32 s0, s0, s2 +; GFX67-NEXT: s_lshl_b32 s5, s5, 16 +; GFX67-NEXT: s_lshl_b32 s3, s4, 16 ; GFX67-NEXT: s_and_b32 s0, s0, 0xffff -; GFX67-NEXT: s_lshl_b32 s2, s4, 16 ; GFX67-NEXT: s_and_b32 s1, s1, 0xffff -; GFX67-NEXT: s_or_b32 s0, s0, s2 -; GFX67-NEXT: s_lshl_b32 s2, s5, 16 -; GFX67-NEXT: s_or_b32 s1, s1, s2 +; GFX67-NEXT: s_or_b32 s0, s0, s3 +; GFX67-NEXT: s_or_b32 s1, s1, s5 ; GFX67-NEXT: v_mov_b32_e32 v0, s0 ; GFX67-NEXT: v_mov_b32_e32 v1, s1 ; GFX67-NEXT: ; return to shader part epilog @@ -1246,23 +1246,23 @@ define amdgpu_vs <6 x half> @load_v6i16(ptr addrspace(6) inreg %p0, ptr addrspac ; GFX67-NEXT: s_lshr_b32 s3, s0, 16 ; GFX67-NEXT: s_lshr_b32 s9, s4, 16 ; GFX67-NEXT: s_lshr_b32 s7, s1, 16 -; GFX67-NEXT: s_lshr_b32 s10, s5, 16 -; GFX67-NEXT: s_add_i32 s0, s0, s4 -; GFX67-NEXT: s_add_i32 s3, s3, s9 ; GFX67-NEXT: s_lshr_b32 s8, s2, 16 +; GFX67-NEXT: s_lshr_b32 s10, s5, 16 ; GFX67-NEXT: s_lshr_b32 s11, s6, 16 -; GFX67-NEXT: s_add_i32 s1, s1, s5 +; GFX67-NEXT: s_add_i32 s3, s3, s9 +; GFX67-NEXT: s_add_i32 s0, s0, s4 +; GFX67-NEXT: s_add_i32 s8, s8, s11 +; GFX67-NEXT: s_add_i32 s2, s2, s6 ; GFX67-NEXT: s_add_i32 s7, s7, s10 +; GFX67-NEXT: s_add_i32 s1, s1, s5 ; GFX67-NEXT: s_and_b32 s0, s0, 0xffff ; GFX67-NEXT: s_lshl_b32 s3, s3, 16 -; GFX67-NEXT: s_add_i32 s2, s2, s6 -; GFX67-NEXT: s_add_i32 s8, s8, s11 -; GFX67-NEXT: s_and_b32 s1, s1, 0xffff +; GFX67-NEXT: s_lshl_b32 s6, s7, 16 ; GFX67-NEXT: s_or_b32 s0, s0, s3 -; GFX67-NEXT: s_lshl_b32 s3, s7, 16 +; GFX67-NEXT: s_and_b32 s1, s1, 0xffff ; GFX67-NEXT: s_and_b32 s2, s2, 0xffff -; GFX67-NEXT: s_or_b32 s1, s1, s3 ; GFX67-NEXT: s_lshl_b32 s3, s8, 16 +; GFX67-NEXT: s_or_b32 s1, s1, s6 ; GFX67-NEXT: s_or_b32 s2, s2, s3 ; GFX67-NEXT: v_mov_b32_e32 v0, s0 ; GFX67-NEXT: v_mov_b32_e32 v1, s1 @@ -1338,31 +1338,31 @@ define amdgpu_vs <8 x half> @load_v8i16(ptr addrspace(6) inreg %p0, ptr addrspac ; GFX67-NEXT: s_lshr_b32 s8, s0, 16 ; GFX67-NEXT: s_lshr_b32 s12, s4, 16 ; GFX67-NEXT: s_lshr_b32 s9, s1, 16 -; GFX67-NEXT: s_lshr_b32 s13, s5, 16 -; GFX67-NEXT: s_add_i32 s0, s0, s4 -; GFX67-NEXT: s_add_i32 s8, s8, s12 ; GFX67-NEXT: s_lshr_b32 s10, s2, 16 +; GFX67-NEXT: s_lshr_b32 s11, s3, 16 +; GFX67-NEXT: s_lshr_b32 s13, s5, 16 ; GFX67-NEXT: s_lshr_b32 s14, s6, 16 -; GFX67-NEXT: s_add_i32 s1, s1, s5 +; GFX67-NEXT: s_lshr_b32 s15, s7, 16 +; GFX67-NEXT: s_add_i32 s8, s8, s12 +; GFX67-NEXT: s_add_i32 s0, s0, s4 +; GFX67-NEXT: s_add_i32 s11, s11, s15 +; GFX67-NEXT: s_add_i32 s3, s3, s7 +; GFX67-NEXT: s_add_i32 s10, s10, s14 +; GFX67-NEXT: s_add_i32 s2, s2, s6 ; GFX67-NEXT: s_add_i32 s9, s9, s13 +; GFX67-NEXT: s_add_i32 s1, s1, s5 ; GFX67-NEXT: s_and_b32 s0, s0, 0xffff ; GFX67-NEXT: s_lshl_b32 s4, s8, 16 -; GFX67-NEXT: s_lshr_b32 s11, s3, 16 -; GFX67-NEXT: s_lshr_b32 s15, s7, 16 -; GFX67-NEXT: s_add_i32 s2, s2, s6 -; GFX67-NEXT: s_add_i32 s10, s10, s14 -; GFX67-NEXT: s_and_b32 s1, s1, 0xffff +; GFX67-NEXT: s_lshl_b32 s11, s11, 16 +; GFX67-NEXT: s_lshl_b32 s6, s9, 16 ; GFX67-NEXT: s_or_b32 s0, s0, s4 -; GFX67-NEXT: s_lshl_b32 s4, s9, 16 -; GFX67-NEXT: s_add_i32 s3, s3, s7 -; GFX67-NEXT: s_add_i32 s11, s11, s15 +; GFX67-NEXT: s_and_b32 s1, s1, 0xffff ; GFX67-NEXT: s_and_b32 s2, s2, 0xffff -; GFX67-NEXT: s_or_b32 s1, s1, s4 ; GFX67-NEXT: s_lshl_b32 s4, s10, 16 ; GFX67-NEXT: s_and_b32 s3, s3, 0xffff +; GFX67-NEXT: s_or_b32 s1, s1, s6 ; GFX67-NEXT: s_or_b32 s2, s2, s4 -; GFX67-NEXT: s_lshl_b32 s4, s11, 16 -; GFX67-NEXT: s_or_b32 s3, s3, s4 +; GFX67-NEXT: s_or_b32 s3, s3, s11 ; GFX67-NEXT: v_mov_b32_e32 v0, s0 ; GFX67-NEXT: v_mov_b32_e32 v1, s1 ; GFX67-NEXT: v_mov_b32_e32 v2, s2 @@ -1447,60 +1447,60 @@ define amdgpu_vs <16 x half> @load_v16i16(ptr addrspace(6) inreg %p0, ptr addrsp ; GFX67-NEXT: s_waitcnt lgkmcnt(0) ; GFX67-NEXT: s_lshr_b32 s16, s0, 16 ; GFX67-NEXT: s_lshr_b32 s24, s8, 16 -; GFX67-NEXT: s_lshr_b32 s17, s1, 16 -; GFX67-NEXT: s_lshr_b32 s25, s9, 16 -; GFX67-NEXT: s_add_i32 s0, s0, s8 -; GFX67-NEXT: s_add_i32 s16, s16, s24 ; GFX67-NEXT: s_lshr_b32 s18, s2, 16 ; GFX67-NEXT: s_lshr_b32 s26, s10, 16 -; GFX67-NEXT: s_add_i32 s1, s1, s9 -; GFX67-NEXT: s_add_i32 s17, s17, s25 +; GFX67-NEXT: s_add_i32 s16, s16, s24 +; GFX67-NEXT: s_add_i32 s0, s0, s8 +; GFX67-NEXT: s_lshr_b32 s20, s4, 16 +; GFX67-NEXT: s_lshr_b32 s28, s12, 16 +; GFX67-NEXT: s_add_i32 s18, s18, s26 +; GFX67-NEXT: s_add_i32 s2, s2, s10 ; GFX67-NEXT: s_and_b32 s0, s0, 0xffff ; GFX67-NEXT: s_lshl_b32 s8, s16, 16 +; GFX67-NEXT: s_lshr_b32 s17, s1, 16 ; GFX67-NEXT: s_lshr_b32 s19, s3, 16 +; GFX67-NEXT: s_lshr_b32 s21, s5, 16 +; GFX67-NEXT: s_lshr_b32 s22, s6, 16 +; GFX67-NEXT: s_lshr_b32 s23, s7, 16 +; GFX67-NEXT: s_lshr_b32 s25, s9, 16 ; GFX67-NEXT: s_lshr_b32 s27, s11, 16 -; GFX67-NEXT: s_add_i32 s2, s2, s10 -; GFX67-NEXT: s_add_i32 s18, s18, s26 -; GFX67-NEXT: s_and_b32 s1, s1, 0xffff +; GFX67-NEXT: s_lshr_b32 s29, s13, 16 +; GFX67-NEXT: s_lshr_b32 s30, s14, 16 +; GFX67-NEXT: s_lshr_b32 s31, s15, 16 +; GFX67-NEXT: s_add_i32 s20, s20, s28 +; GFX67-NEXT: s_add_i32 s4, s4, s12 ; GFX67-NEXT: s_or_b32 s0, s0, s8 -; GFX67-NEXT: s_lshl_b32 s8, s17, 16 -; GFX67-NEXT: s_lshr_b32 s20, s4, 16 -; GFX67-NEXT: s_lshr_b32 s28, s12, 16 -; GFX67-NEXT: s_add_i32 s3, s3, s11 -; GFX67-NEXT: s_add_i32 s19, s19, s27 ; GFX67-NEXT: s_and_b32 s2, s2, 0xffff -; GFX67-NEXT: s_or_b32 s1, s1, s8 ; GFX67-NEXT: s_lshl_b32 s8, s18, 16 -; GFX67-NEXT: s_lshr_b32 s21, s5, 16 -; GFX67-NEXT: s_lshr_b32 s29, s13, 16 -; GFX67-NEXT: s_add_i32 s4, s4, s12 -; GFX67-NEXT: s_add_i32 s20, s20, s28 -; GFX67-NEXT: s_and_b32 s3, s3, 0xffff -; GFX67-NEXT: s_or_b32 s2, s2, s8 -; GFX67-NEXT: s_lshl_b32 s8, s19, 16 -; GFX67-NEXT: s_lshr_b32 s22, s6, 16 -; GFX67-NEXT: s_lshr_b32 s30, s14, 16 -; GFX67-NEXT: s_add_i32 s5, s5, s13 +; GFX67-NEXT: s_add_i32 s23, s23, s31 +; GFX67-NEXT: s_add_i32 s7, s7, s15 +; GFX67-NEXT: s_add_i32 s22, s22, s30 +; GFX67-NEXT: s_add_i32 s6, s6, s14 ; GFX67-NEXT: s_add_i32 s21, s21, s29 +; GFX67-NEXT: s_add_i32 s5, s5, s13 +; GFX67-NEXT: s_add_i32 s19, s19, s27 +; GFX67-NEXT: s_add_i32 s3, s3, s11 +; GFX67-NEXT: s_add_i32 s17, s17, s25 +; GFX67-NEXT: s_add_i32 s1, s1, s9 +; GFX67-NEXT: s_or_b32 s2, s2, s8 ; GFX67-NEXT: s_and_b32 s4, s4, 0xffff -; GFX67-NEXT: s_or_b32 s3, s3, s8 ; GFX67-NEXT: s_lshl_b32 s8, s20, 16 -; GFX67-NEXT: s_lshr_b32 s23, s7, 16 -; GFX67-NEXT: s_lshr_b32 s31, s15, 16 -; GFX67-NEXT: s_add_i32 s6, s6, s14 -; GFX67-NEXT: s_add_i32 s22, s22, s30 -; GFX67-NEXT: s_and_b32 s5, s5, 0xffff +; GFX67-NEXT: s_lshl_b32 s23, s23, 16 +; GFX67-NEXT: s_lshl_b32 s14, s21, 16 +; GFX67-NEXT: s_lshl_b32 s12, s19, 16 +; GFX67-NEXT: s_lshl_b32 s10, s17, 16 +; GFX67-NEXT: s_and_b32 s1, s1, 0xffff +; GFX67-NEXT: s_and_b32 s3, s3, 0xffff ; GFX67-NEXT: s_or_b32 s4, s4, s8 -; GFX67-NEXT: s_lshl_b32 s8, s21, 16 -; GFX67-NEXT: s_add_i32 s7, s7, s15 -; GFX67-NEXT: s_add_i32 s23, s23, s31 +; GFX67-NEXT: s_and_b32 s5, s5, 0xffff ; GFX67-NEXT: s_and_b32 s6, s6, 0xffff -; GFX67-NEXT: s_or_b32 s5, s5, s8 ; GFX67-NEXT: s_lshl_b32 s8, s22, 16 ; GFX67-NEXT: s_and_b32 s7, s7, 0xffff +; GFX67-NEXT: s_or_b32 s1, s1, s10 +; GFX67-NEXT: s_or_b32 s3, s3, s12 +; GFX67-NEXT: s_or_b32 s5, s5, s14 ; GFX67-NEXT: s_or_b32 s6, s6, s8 -; GFX67-NEXT: s_lshl_b32 s8, s23, 16 -; GFX67-NEXT: s_or_b32 s7, s7, s8 +; GFX67-NEXT: s_or_b32 s7, s7, s23 ; GFX67-NEXT: v_mov_b32_e32 v0, s0 ; GFX67-NEXT: v_mov_b32_e32 v1, s1 ; GFX67-NEXT: v_mov_b32_e32 v2, s2 @@ -1820,11 +1820,11 @@ define amdgpu_vs <2 x half> @load_v2f16(ptr addrspace(6) inreg %p0, ptr addrspac ; GFX67-NEXT: s_waitcnt lgkmcnt(0) ; GFX67-NEXT: s_lshr_b32 s1, s2, 16 ; GFX67-NEXT: s_lshr_b32 s3, s0, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v0, s3 -; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s1 +; GFX67-NEXT: v_cvt_f32_f16_e32 v0, s1 +; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s3 ; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s2 ; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s0 -; GFX67-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX67-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX67-NEXT: v_add_f32_e32 v1, v3, v2 ; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -1871,28 +1871,28 @@ define amdgpu_vs <2 x half> @load_v2f16(ptr addrspace(6) inreg %p0, ptr addrspac define amdgpu_vs <3 x half> @load_v3f16(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 { ; GFX67-LABEL: load_v3f16: ; GFX67: ; %bb.0: -; GFX67-NEXT: s_mov_b32 s3, 0 ; GFX67-NEXT: s_mov_b32 s2, s1 +; GFX67-NEXT: s_mov_b32 s3, 0 +; GFX67-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4 ; GFX67-NEXT: s_mov_b32 s1, s3 ; GFX67-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX67-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x4 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: s_lshr_b32 s4, s0, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX67-NEXT: s_lshr_b32 s0, s2, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s4 -; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s0 +; GFX67-NEXT: s_lshr_b32 s2, s4, 16 +; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s2 +; GFX67-NEXT: s_lshr_b32 s2, s0, 16 ; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s2 -; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s1 -; GFX67-NEXT: v_cvt_f32_f16_e32 v5, s3 -; GFX67-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX67-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s4 +; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s0 +; GFX67-NEXT: v_cvt_f32_f16_e32 v0, s5 +; GFX67-NEXT: v_cvt_f32_f16_e32 v5, s1 +; GFX67-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v1 -; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX67-NEXT: v_add_f32_e32 v1, v4, v5 -; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX67-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX67-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX67-NEXT: v_add_f32_e32 v1, v4, v3 +; GFX67-NEXT: v_cvt_f16_f32_e32 v3, v1 +; GFX67-NEXT: v_add_f32_e32 v0, v5, v0 +; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v0 +; GFX67-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX67-NEXT: v_or_b32_e32 v0, v3, v0 ; GFX67-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_v3f16: @@ -1941,33 +1941,33 @@ define amdgpu_vs <4 x half> @load_v4f16(ptr addrspace(6) inreg %p0, ptr addrspac ; GFX67-NEXT: s_mov_b32 s3, 0 ; GFX67-NEXT: s_mov_b32 s2, s1 ; GFX67-NEXT: s_mov_b32 s1, s3 -; GFX67-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX67-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x4 +; GFX67-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: s_lshr_b32 s4, s0, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX67-NEXT: s_lshr_b32 s0, s1, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s0 -; GFX67-NEXT: s_lshr_b32 s0, s2, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s4 -; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s1 -; GFX67-NEXT: s_lshr_b32 s1, s3, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v7, s0 -; GFX67-NEXT: v_cvt_f32_f16_e32 v5, s1 -; GFX67-NEXT: v_cvt_f32_f16_e32 v6, s2 -; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s3 -; GFX67-NEXT: v_add_f32_e32 v1, v1, v7 -; GFX67-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX67-NEXT: v_add_f32_e32 v0, v0, v6 -; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX67-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX67-NEXT: s_lshr_b32 s4, s3, 16 +; GFX67-NEXT: s_lshr_b32 s5, s1, 16 +; GFX67-NEXT: v_cvt_f32_f16_e32 v0, s4 +; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s5 +; GFX67-NEXT: s_lshr_b32 s4, s2, 16 +; GFX67-NEXT: s_lshr_b32 s5, s0, 16 +; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s1 +; GFX67-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s3 +; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s4 +; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s5 +; GFX67-NEXT: v_cvt_f32_f16_e32 v5, s2 +; GFX67-NEXT: v_cvt_f32_f16_e32 v6, s0 +; GFX67-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX67-NEXT: v_add_f32_e32 v2, v4, v3 ; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX67-NEXT: v_add_f32_e32 v3, v6, v5 ; GFX67-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX67-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX67-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX67-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX67-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX67-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX67-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX67-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_v4f16: @@ -2019,48 +2019,48 @@ define amdgpu_vs <4 x half> @load_v4f16(ptr addrspace(6) inreg %p0, ptr addrspac define amdgpu_vs <6 x half> @load_v6f16(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 { ; GFX67-LABEL: load_v6f16: ; GFX67: ; %bb.0: -; GFX67-NEXT: s_mov_b32 s5, 0 -; GFX67-NEXT: s_mov_b32 s4, s1 -; GFX67-NEXT: s_mov_b32 s1, s5 +; GFX67-NEXT: s_mov_b32 s3, 0 +; GFX67-NEXT: s_mov_b32 s2, s1 +; GFX67-NEXT: s_mov_b32 s1, s3 +; GFX67-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x8 ; GFX67-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX67-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x8 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: s_lshr_b32 s3, s0, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX67-NEXT: s_lshr_b32 s0, s1, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s0 -; GFX67-NEXT: s_lshr_b32 s0, s2, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s0 -; GFX67-NEXT: s_lshr_b32 s0, s4, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s3 -; GFX67-NEXT: v_cvt_f32_f16_e32 v6, s0 -; GFX67-NEXT: v_cvt_f32_f16_e32 v7, s4 -; GFX67-NEXT: s_lshr_b32 s0, s5, 16 +; GFX67-NEXT: v_cvt_f32_f16_e32 v0, s6 +; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s2 +; GFX67-NEXT: s_lshr_b32 s2, s2, 16 +; GFX67-NEXT: s_lshr_b32 s3, s6, 16 +; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s3 +; GFX67-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s2 +; GFX67-NEXT: s_lshr_b32 s2, s5, 16 +; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s2 +; GFX67-NEXT: s_lshr_b32 s2, s1, 16 +; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s2 +; GFX67-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX67-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v0 +; GFX67-NEXT: v_add_f32_e32 v0, v4, v3 ; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s1 -; GFX67-NEXT: v_add_f32_e32 v1, v1, v6 -; GFX67-NEXT: s_lshr_b32 s1, s6, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v10, s0 -; GFX67-NEXT: v_add_f32_e32 v0, v0, v7 -; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX67-NEXT: v_cvt_f32_f16_e32 v9, s1 -; GFX67-NEXT: v_cvt_f32_f16_e32 v11, s5 +; GFX67-NEXT: v_cvt_f32_f16_e32 v6, s0 +; GFX67-NEXT: s_lshr_b32 s1, s4, 16 +; GFX67-NEXT: s_lshr_b32 s0, s0, 16 +; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s5 +; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s4 +; GFX67-NEXT: v_cvt_f32_f16_e32 v7, s1 +; GFX67-NEXT: v_cvt_f32_f16_e32 v8, s0 +; GFX67-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX67-NEXT: v_add_f32_e32 v3, v6, v4 ; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX67-NEXT: v_cvt_f32_f16_e32 v5, s2 -; GFX67-NEXT: v_cvt_f32_f16_e32 v8, s6 -; GFX67-NEXT: v_add_f32_e32 v2, v2, v10 -; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX67-NEXT: v_add_f32_e32 v4, v4, v9 -; GFX67-NEXT: v_add_f32_e32 v3, v3, v11 -; GFX67-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v2 -; GFX67-NEXT: v_add_f32_e32 v5, v5, v8 -; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v3 -; GFX67-NEXT: v_cvt_f16_f32_e32 v3, v4 -; GFX67-NEXT: v_cvt_f16_f32_e32 v4, v5 -; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX67-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX67-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX67-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX67-NEXT: v_add_f32_e32 v4, v8, v7 +; GFX67-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX67-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX67-NEXT: v_lshlrev_b32_e32 v6, 16, v0 +; GFX67-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX67-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX67-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX67-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX67-NEXT: v_or_b32_e32 v2, v5, v2 ; GFX67-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_v6f16: @@ -2126,57 +2126,56 @@ define amdgpu_vs <8 x half> @load_v8f16(ptr addrspace(6) inreg %p0, ptr addrspac ; GFX67-NEXT: s_mov_b32 s4, s1 ; GFX67-NEXT: s_mov_b32 s1, s5 ; GFX67-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: s_lshr_b32 s6, s0, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s6 ; GFX67-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x8 -; GFX67-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX67-NEXT: s_lshr_b32 s0, s1, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s0 -; GFX67-NEXT: s_lshr_b32 s0, s2, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s0 -; GFX67-NEXT: s_lshr_b32 s0, s3, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v6, s0 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: s_lshr_b32 s0, s4, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v8, s0 -; GFX67-NEXT: s_lshr_b32 s0, s5, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v9, s4 -; GFX67-NEXT: v_cvt_f32_f16_e32 v10, s0 +; GFX67-NEXT: s_lshr_b32 s11, s3, 16 +; GFX67-NEXT: s_lshr_b32 s12, s7, 16 +; GFX67-NEXT: v_cvt_f32_f16_e32 v0, s12 +; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s11 +; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s3 +; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s6 +; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s2 +; GFX67-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s7 +; GFX67-NEXT: s_lshr_b32 s10, s2, 16 +; GFX67-NEXT: s_lshr_b32 s13, s6, 16 +; GFX67-NEXT: s_lshr_b32 s9, s1, 16 +; GFX67-NEXT: s_lshr_b32 s12, s5, 16 +; GFX67-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX67-NEXT: v_add_f32_e32 v0, v2, v1 +; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s13 +; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s10 +; GFX67-NEXT: v_cvt_f16_f32_e32 v6, v0 +; GFX67-NEXT: v_add_f32_e32 v0, v4, v3 +; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s12 +; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s9 +; GFX67-NEXT: s_lshr_b32 s8, s0, 16 +; GFX67-NEXT: s_lshr_b32 s11, s4, 16 +; GFX67-NEXT: v_cvt_f16_f32_e32 v7, v0 +; GFX67-NEXT: v_add_f32_e32 v0, v2, v1 +; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v0 +; GFX67-NEXT: v_add_f32_e32 v0, v4, v3 +; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s5 ; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s1 -; GFX67-NEXT: v_cvt_f32_f16_e32 v11, s5 -; GFX67-NEXT: s_lshr_b32 s0, s6, 16 -; GFX67-NEXT: v_add_f32_e32 v1, v1, v8 -; GFX67-NEXT: s_lshr_b32 s1, s7, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v15, s0 -; GFX67-NEXT: v_add_f32_e32 v2, v2, v10 -; GFX67-NEXT: v_add_f32_e32 v0, v0, v9 -; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX67-NEXT: v_cvt_f32_f16_e32 v5, s2 -; GFX67-NEXT: v_cvt_f32_f16_e32 v13, s1 -; GFX67-NEXT: v_cvt_f32_f16_e32 v14, s6 -; GFX67-NEXT: v_add_f32_e32 v3, v3, v11 +; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s4 +; GFX67-NEXT: v_cvt_f32_f16_e32 v8, s0 +; GFX67-NEXT: v_cvt_f32_f16_e32 v9, s11 +; GFX67-NEXT: v_cvt_f32_f16_e32 v10, s8 +; GFX67-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX67-NEXT: v_add_f32_e32 v3, v8, v4 ; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX67-NEXT: v_cvt_f32_f16_e32 v7, s3 -; GFX67-NEXT: v_cvt_f32_f16_e32 v12, s7 +; GFX67-NEXT: v_add_f32_e32 v4, v10, v9 +; GFX67-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX67-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX67-NEXT: v_add_f32_e32 v4, v4, v15 -; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX67-NEXT: v_add_f32_e32 v6, v6, v13 -; GFX67-NEXT: v_add_f32_e32 v5, v5, v14 -; GFX67-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v4 -; GFX67-NEXT: v_add_f32_e32 v7, v7, v12 -; GFX67-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX67-NEXT: v_cvt_f16_f32_e32 v3, v5 -; GFX67-NEXT: v_cvt_f16_f32_e32 v4, v6 -; GFX67-NEXT: v_cvt_f16_f32_e32 v5, v7 +; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX67-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; GFX67-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX67-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX67-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX67-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX67-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX67-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX67-NEXT: v_or_b32_e32 v1, v1, v8 +; GFX67-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX67-NEXT: v_or_b32_e32 v3, v6, v5 ; GFX67-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_v8f16: @@ -2248,109 +2247,108 @@ define amdgpu_vs <8 x half> @load_v8f16(ptr addrspace(6) inreg %p0, ptr addrspac define amdgpu_vs <16 x half> @load_v16f16(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 { ; GFX67-LABEL: load_v16f16: ; GFX67: ; %bb.0: -; GFX67-NEXT: s_mov_b32 s3, 0 -; GFX67-NEXT: s_mov_b32 s2, s1 -; GFX67-NEXT: s_mov_b32 s1, s3 -; GFX67-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x0 -; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: s_lshr_b32 s0, s4, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s0 -; GFX67-NEXT: s_lshr_b32 s0, s6, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s0 -; GFX67-NEXT: s_lshr_b32 s0, s7, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v6, s0 -; GFX67-NEXT: s_lshr_b32 s0, s8, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v8, s0 -; GFX67-NEXT: s_lshr_b32 s0, s9, 16 -; GFX67-NEXT: s_lshr_b32 s1, s5, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v10, s0 -; GFX67-NEXT: s_lshr_b32 s0, s10, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v0, s4 -; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s1 -; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s5 -; GFX67-NEXT: v_cvt_f32_f16_e32 v5, s6 -; GFX67-NEXT: v_cvt_f32_f16_e32 v7, s7 -; GFX67-NEXT: v_cvt_f32_f16_e32 v12, s0 -; GFX67-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x10 -; GFX67-NEXT: v_cvt_f32_f16_e32 v9, s8 -; GFX67-NEXT: v_cvt_f32_f16_e32 v13, s10 -; GFX67-NEXT: s_lshr_b32 s8, s11, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v14, s8 +; GFX67-NEXT: s_mov_b32 s9, 0 +; GFX67-NEXT: s_mov_b32 s8, s1 +; GFX67-NEXT: s_mov_b32 s1, s9 +; GFX67-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x0 +; GFX67-NEXT: s_load_dwordx8 s[8:15], s[8:9], 0x10 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: v_cvt_f32_f16_e32 v17, s7 -; GFX67-NEXT: s_lshr_b32 s7, s7, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s6 -; GFX67-NEXT: v_cvt_f32_f16_e32 v18, s7 -; GFX67-NEXT: v_cvt_f32_f16_e32 v15, s11 -; GFX67-NEXT: s_lshr_b32 s12, s5, 16 -; GFX67-NEXT: v_add_f32_e32 v13, v13, v19 -; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s12 -; GFX67-NEXT: v_cvt_f32_f16_e32 v11, s9 -; GFX67-NEXT: s_lshr_b32 s13, s6, 16 -; GFX67-NEXT: v_add_f32_e32 v14, v14, v18 -; GFX67-NEXT: v_cvt_f32_f16_e32 v18, s5 -; GFX67-NEXT: v_add_f32_e32 v15, v15, v17 -; GFX67-NEXT: v_cvt_f32_f16_e32 v17, s13 -; GFX67-NEXT: s_lshr_b32 s11, s4, 16 -; GFX67-NEXT: v_add_f32_e32 v10, v10, v19 -; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s3 -; GFX67-NEXT: v_add_f32_e32 v11, v11, v18 -; GFX67-NEXT: v_cvt_f32_f16_e32 v18, s11 -; GFX67-NEXT: v_add_f32_e32 v12, v12, v17 -; GFX67-NEXT: v_cvt_f32_f16_e32 v17, s4 -; GFX67-NEXT: s_lshr_b32 s9, s2, 16 -; GFX67-NEXT: v_add_f32_e32 v7, v7, v19 -; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s9 -; GFX67-NEXT: s_lshr_b32 s10, s3, 16 -; GFX67-NEXT: v_add_f32_e32 v8, v8, v18 -; GFX67-NEXT: v_cvt_f32_f16_e32 v18, s2 -; GFX67-NEXT: v_add_f32_e32 v9, v9, v17 -; GFX67-NEXT: v_cvt_f32_f16_e32 v17, s10 -; GFX67-NEXT: s_lshr_b32 s8, s0, 16 +; GFX67-NEXT: s_lshr_b32 s23, s7, 16 +; GFX67-NEXT: s_lshr_b32 s28, s15, 16 +; GFX67-NEXT: v_cvt_f32_f16_e32 v0, s28 +; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s23 +; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s7 +; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s14 +; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s6 +; GFX67-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s15 +; GFX67-NEXT: s_lshr_b32 s22, s6, 16 +; GFX67-NEXT: s_lshr_b32 s29, s14, 16 +; GFX67-NEXT: s_lshr_b32 s21, s5, 16 +; GFX67-NEXT: s_lshr_b32 s28, s13, 16 +; GFX67-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; GFX67-NEXT: v_add_f32_e32 v0, v2, v1 +; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s29 +; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s22 +; GFX67-NEXT: v_cvt_f16_f32_e32 v8, v0 +; GFX67-NEXT: v_add_f32_e32 v0, v4, v3 +; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s28 +; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s21 +; GFX67-NEXT: v_cvt_f16_f32_e32 v6, v0 +; GFX67-NEXT: v_add_f32_e32 v0, v2, v1 +; GFX67-NEXT: v_cvt_f16_f32_e32 v9, v0 +; GFX67-NEXT: v_add_f32_e32 v0, v4, v3 +; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s13 +; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s5 +; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s12 +; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s4 +; GFX67-NEXT: s_lshr_b32 s20, s4, 16 +; GFX67-NEXT: s_lshr_b32 s23, s12, 16 +; GFX67-NEXT: s_lshr_b32 s19, s3, 16 +; GFX67-NEXT: s_lshr_b32 s27, s11, 16 +; GFX67-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX67-NEXT: v_add_f32_e32 v0, v2, v1 +; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s23 +; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s20 +; GFX67-NEXT: v_cvt_f16_f32_e32 v10, v0 +; GFX67-NEXT: v_add_f32_e32 v0, v4, v3 +; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s27 +; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s19 +; GFX67-NEXT: v_cvt_f16_f32_e32 v11, v0 +; GFX67-NEXT: v_add_f32_e32 v0, v2, v1 +; GFX67-NEXT: v_cvt_f16_f32_e32 v12, v0 +; GFX67-NEXT: v_add_f32_e32 v0, v4, v3 +; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s11 +; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s3 +; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s10 +; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s2 +; GFX67-NEXT: s_lshr_b32 s18, s2, 16 +; GFX67-NEXT: s_lshr_b32 s26, s10, 16 +; GFX67-NEXT: s_lshr_b32 s17, s1, 16 +; GFX67-NEXT: s_lshr_b32 s25, s9, 16 +; GFX67-NEXT: v_lshlrev_b32_e32 v13, 16, v0 +; GFX67-NEXT: v_add_f32_e32 v0, v2, v1 +; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s26 +; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s18 +; GFX67-NEXT: v_cvt_f16_f32_e32 v14, v0 +; GFX67-NEXT: v_add_f32_e32 v0, v4, v3 +; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s25 +; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s17 +; GFX67-NEXT: s_lshr_b32 s16, s0, 16 +; GFX67-NEXT: s_lshr_b32 s24, s8, 16 +; GFX67-NEXT: v_cvt_f16_f32_e32 v15, v0 +; GFX67-NEXT: v_add_f32_e32 v0, v2, v1 +; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v0 +; GFX67-NEXT: v_add_f32_e32 v0, v4, v3 +; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s9 +; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s1 +; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s8 ; GFX67-NEXT: v_cvt_f32_f16_e32 v16, s0 -; GFX67-NEXT: s_lshr_b32 s0, s1, 16 -; GFX67-NEXT: v_add_f32_e32 v4, v4, v19 -; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s8 -; GFX67-NEXT: v_add_f32_e32 v5, v5, v18 -; GFX67-NEXT: v_cvt_f32_f16_e32 v18, s0 -; GFX67-NEXT: v_add_f32_e32 v6, v6, v17 -; GFX67-NEXT: v_cvt_f32_f16_e32 v17, s1 -; GFX67-NEXT: v_add_f32_e32 v1, v1, v19 -; GFX67-NEXT: v_add_f32_e32 v2, v2, v18 -; GFX67-NEXT: v_add_f32_e32 v0, v0, v16 -; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX67-NEXT: v_add_f32_e32 v3, v3, v17 +; GFX67-NEXT: v_cvt_f32_f16_e32 v17, s24 +; GFX67-NEXT: v_cvt_f32_f16_e32 v18, s16 +; GFX67-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX67-NEXT: v_add_f32_e32 v3, v16, v4 ; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX67-NEXT: v_add_f32_e32 v4, v18, v17 +; GFX67-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX67-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX67-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v4 -; GFX67-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX67-NEXT: v_cvt_f16_f32_e32 v3, v5 -; GFX67-NEXT: v_cvt_f16_f32_e32 v4, v6 -; GFX67-NEXT: v_cvt_f16_f32_e32 v5, v7 +; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX67-NEXT: v_lshlrev_b32_e32 v16, 16, v0 +; GFX67-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX67-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX67-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX67-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX67-NEXT: v_cvt_f16_f32_e32 v4, v8 -; GFX67-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX67-NEXT: v_cvt_f16_f32_e32 v5, v9 -; GFX67-NEXT: v_cvt_f16_f32_e32 v6, v10 -; GFX67-NEXT: v_cvt_f16_f32_e32 v7, v11 -; GFX67-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX67-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX67-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX67-NEXT: v_cvt_f16_f32_e32 v6, v12 -; GFX67-NEXT: v_or_b32_e32 v5, v7, v5 -; GFX67-NEXT: v_cvt_f16_f32_e32 v7, v13 -; GFX67-NEXT: v_cvt_f16_f32_e32 v8, v14 -; GFX67-NEXT: v_cvt_f16_f32_e32 v9, v15 -; GFX67-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX67-NEXT: v_or_b32_e32 v6, v7, v6 -; GFX67-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; GFX67-NEXT: v_or_b32_e32 v7, v9, v7 +; GFX67-NEXT: v_lshlrev_b32_e32 v4, 16, v12 +; GFX67-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX67-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX67-NEXT: v_or_b32_e32 v1, v1, v16 +; GFX67-NEXT: v_or_b32_e32 v2, v15, v2 +; GFX67-NEXT: v_or_b32_e32 v3, v14, v13 +; GFX67-NEXT: v_or_b32_e32 v4, v11, v4 +; GFX67-NEXT: v_or_b32_e32 v5, v10, v5 +; GFX67-NEXT: v_or_b32_e32 v6, v6, v9 +; GFX67-NEXT: v_or_b32_e32 v7, v8, v7 ; GFX67-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_v16f16: diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll index 21abcbd4f5edc..76583e806b805 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll @@ -1737,14 +1737,18 @@ define <2 x half> @fmul_select_v2f16_test3(<2 x half> %x, <2 x i32> %bool.arg1, ; GFX7-SDAG-LABEL: fmul_select_v2f16_test3: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mov_b32_e32 v6, 0x3c00 +; GFX7-SDAG-NEXT: v_mov_b32_e32 v7, 0x4000 ; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc ; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 2.0, vcc -; GFX7-SDAG-NEXT: v_mul_f32_e32 v2, v5, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v2, v3, v2 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1899,14 +1903,18 @@ define <2 x half> @fmul_select_v2f16_test4(<2 x half> %x, <2 x i32> %bool.arg1, ; GFX7-SDAG-LABEL: fmul_select_v2f16_test4: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mov_b32_e32 v6, 0x3c00 +; GFX7-SDAG-NEXT: v_mov_b32_e32 v7, 0x3800 ; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0.5, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc ; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0.5, vcc -; GFX7-SDAG-NEXT: v_mul_f32_e32 v2, v5, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v2, v3, v2 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -2129,11 +2137,12 @@ define half @fmul_select_f16_test6(half %x, i32 %bool.arg1, i32 %bool.arg2) { ; GFX7-SDAG-LABEL: fmul_select_f16_test6: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_mov_b32_e32 v3, 0x40400000 -; GFX7-SDAG-NEXT: v_mov_b32_e32 v4, 0xc1000000 +; GFX7-SDAG-NEXT: v_mov_b32_e32 v3, 0x4200 +; GFX7-SDAG-NEXT: v_mov_b32_e32 v4, 0xc800 ; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -2238,10 +2247,12 @@ define half @fmul_select_f16_test7(half %x, i32 %bool.arg1, i32 %bool.arg2) { ; GFX7-SDAG-LABEL: fmul_select_f16_test7: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_mov_b32_e32 v3, 0x41000000 +; GFX7-SDAG-NEXT: v_mov_b32_e32 v3, 0xc400 +; GFX7-SDAG-NEXT: v_mov_b32_e32 v4, 0x4800 ; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, -4.0, v3, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -2346,10 +2357,11 @@ define half @fmul_select_f16_test8(half %x, i32 %bool.arg1, i32 %bool.arg2) { ; GFX7-SDAG-LABEL: fmul_select_f16_test8: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX7-SDAG-NEXT: v_mov_b32_e32 v3, 0x8000 ; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -2428,7 +2440,6 @@ define half @fmul_select_f16_test9(half %x, i32 %bool.arg1, i32 %bool.arg2) { ; GFX7-SDAG-LABEL: fmul_select_f16_test9: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, 5, 4, vcc diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll index d9b23d43d593d..305ce4e6d4c45 100644 --- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll @@ -654,8 +654,8 @@ define float @divergent_vec_f16_LL(half %a, half %b) { ; GCN-LABEL: divergent_vec_f16_LL: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll index 78a00dd51c2b2..d879ebede164e 100644 --- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll +++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll @@ -398,9 +398,9 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc +; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -414,26 +414,23 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 -; SI-NEXT: v_or_b32_e32 v5, v4, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execnz .LBB2_3 ; SI-NEXT: .LBB2_2: ; %T ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:2 glc +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:6 glc +; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -443,40 +440,32 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_or_b32_e32 v0, v3, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v0 +; SI-NEXT: v_or_b32_e32 v3, v3, v1 ; SI-NEXT: .LBB2_3: ; %exit -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_mov_b32_e32 v3, 0x3fa00000 -; SI-NEXT: v_mov_b32_e32 v4, 0x3f200000 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v0 -; SI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v1 -; SI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; SI-NEXT: v_mov_b32_e32 v3, 0x3d00 +; SI-NEXT: v_mov_b32_e32 v4, 0x3900 +; SI-NEXT: v_mov_b32_e32 v5, 0x3d000000 +; SI-NEXT: v_mov_b32_e32 v6, 0x39000000 ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v2 ; SI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v1 +; SI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v0 +; SI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB2_4: ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: s_branch .LBB2_2 ; @@ -1083,9 +1072,9 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace( ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc +; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -1115,26 +1104,23 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace( ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 -; SI-NEXT: v_or_b32_e32 v5, v4, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execnz .LBB5_3 ; SI-NEXT: .LBB5_2: ; %T ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:2 glc +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:6 glc +; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -1160,40 +1146,32 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace( ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_or_b32_e32 v0, v3, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v0 +; SI-NEXT: v_or_b32_e32 v3, v3, v1 ; SI-NEXT: .LBB5_3: ; %exit -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_mov_b32_e32 v3, 0x3fa00000 -; SI-NEXT: v_mov_b32_e32 v4, 0x3f200000 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v0 -; SI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v1 -; SI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; SI-NEXT: v_mov_b32_e32 v3, 0x3d00 +; SI-NEXT: v_mov_b32_e32 v4, 0x3900 +; SI-NEXT: v_mov_b32_e32 v5, 0x3d000000 +; SI-NEXT: v_mov_b32_e32 v6, 0x39000000 ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v2 ; SI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v1 +; SI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v0 +; SI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB5_4: ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: s_branch .LBB5_2 ; @@ -1739,21 +1717,21 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add ; SI-NEXT: s_mov_b32 s39, 0xf000 ; SI-NEXT: s_mov_b32 s36, s38 ; SI-NEXT: s_mov_b32 s37, s38 -; SI-NEXT: buffer_load_ushort v6, v[2:3], s[36:39], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v9, v[2:3], s[36:39], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v7, v[2:3], s[36:39], 0 addr64 offset:2 glc +; SI-NEXT: buffer_load_ushort v4, v[2:3], s[36:39], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[36:39], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v9, v[2:3], s[36:39], 0 addr64 offset:6 glc +; SI-NEXT: buffer_load_ushort v5, v[2:3], s[36:39], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v10, v[2:3], s[36:39], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[2:3], s[36:39], 0 addr64 offset:10 glc +; SI-NEXT: buffer_load_ushort v7, v[2:3], s[36:39], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v11, v[2:3], s[36:39], 0 addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[2:3], s[36:39], 0 addr64 offset:14 glc +; SI-NEXT: buffer_load_ushort v6, v[2:3], s[36:39], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -1771,138 +1749,104 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[36:39], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v7 -; SI-NEXT: v_or_b32_e32 v7, v11, v12 -; SI-NEXT: v_or_b32_e32 v10, v10, v13 -; SI-NEXT: v_or_b32_e32 v11, v8, v14 -; SI-NEXT: v_or_b32_e32 v6, v6, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v2, v11, v2 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_or_b32_e32 v8, v8, v12 +; SI-NEXT: v_or_b32_e32 v9, v9, v13 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execz .LBB8_3 ; SI-NEXT: s_branch .LBB8_4 ; SI-NEXT: .LBB8_2: -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: .LBB8_3: ; %T ; SI-NEXT: s_mov_b32 s39, 0xf000 ; SI-NEXT: s_mov_b32 s36, s38 ; SI-NEXT: s_mov_b32 s37, s38 -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[36:39], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:2 glc +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[36:39], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v6, v[0:1], s[36:39], 0 addr64 offset:4 glc +; SI-NEXT: buffer_load_ushort v8, v[0:1], s[36:39], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[36:39], 0 addr64 offset:6 glc +; SI-NEXT: buffer_load_ushort v5, v[0:1], s[36:39], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v7, v[0:1], s[36:39], 0 addr64 offset:8 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[36:39], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[36:39], 0 addr64 offset:10 glc +; SI-NEXT: buffer_load_ushort v7, v[0:1], s[36:39], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v8, v[0:1], s[36:39], 0 addr64 offset:12 glc +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[36:39], 0 addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[0:1], s[36:39], 0 addr64 offset:14 glc +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[36:39], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:16 glc +; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:18 glc +; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:18 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:20 glc +; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:20 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:22 glc +; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:22 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:24 glc +; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:24 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:26 glc +; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:26 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:28 glc +; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:28 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[36:39], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v0, v8, v0 -; SI-NEXT: v_or_b32_e32 v1, v7, v1 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_or_b32_e32 v3, v3, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v0 +; SI-NEXT: v_or_b32_e32 v3, v3, v1 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 ; SI-NEXT: .LBB8_4: ; %exit -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_mov_b32_e32 v8, 0x3fa00000 -; SI-NEXT: v_mov_b32_e32 v9, 0x3f200000 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v0 -; SI-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc -; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v1 -; SI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v5 -; SI-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v4 -; SI-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v7 -; SI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_mov_b32_e32 v8, 0x3d00 +; SI-NEXT: v_mov_b32_e32 v9, 0x3900 +; SI-NEXT: v_mov_b32_e32 v10, 0x3d000000 +; SI-NEXT: v_mov_b32_e32 v11, 0x39000000 ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v2 -; SI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v12, v8, v9, vcc ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v6 -; SI-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v3 +; SI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v7 ; SI-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v6, v0 -; SI-NEXT: v_or_b32_e32 v1, v7, v2 -; SI-NEXT: v_or_b32_e32 v2, v5, v3 -; SI-NEXT: v_or_b32_e32 v3, v8, v4 +; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v5 +; SI-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc +; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v4 +; SI-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v1 +; SI-NEXT: v_cndmask_b32_e32 v1, v10, v11, vcc +; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v0 +; SI-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc +; SI-NEXT: v_or_b32_e32 v0, v4, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v5 +; SI-NEXT: v_or_b32_e32 v2, v2, v7 +; SI-NEXT: v_or_b32_e32 v3, v12, v6 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: vec_16xf16_extract_8xf16_0: diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll index 614200803d6f1..edae2c393e5f0 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll @@ -557,39 +557,30 @@ define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(ptr addrspace(1) ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_cmp_eq_u32 s8, 1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; SI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 3 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 4 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 5 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 6 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 7 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc -; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc ; SI-NEXT: buffer_store_short v0, v[6:7], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; @@ -730,87 +721,71 @@ define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1) ; SI-NEXT: s_load_dword s8, s[4:5], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 -; SI-NEXT: v_mov_b32_e32 v9, 0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 5, v0 +; SI-NEXT: v_mov_b32_e32 v6, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[4:5], s[2:3] -; SI-NEXT: buffer_load_dwordx4 v[1:4], v[8:9], s[4:7], 0 addr64 -; SI-NEXT: buffer_load_dwordx4 v[5:8], v[8:9], s[4:7], 0 addr64 offset:16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v8 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 1, v0 +; SI-NEXT: buffer_load_dwordx4 v[1:4], v[5:6], s[4:7], 0 addr64 +; SI-NEXT: v_lshlrev_b32_e32 v9, 1, v0 +; SI-NEXT: v_mov_b32_e32 v10, v6 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: buffer_load_dwordx4 v[5:8], v[5:6], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_cmp_eq_u32 s8, 1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v18 -; SI-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; SI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 3 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 4 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 5 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 6 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 7 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 8 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 9 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 10 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 11 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v15, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 12 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v15, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 13 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 14 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 15 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, v[8:9], s[0:3], 0 addr64 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc +; SI-NEXT: buffer_store_short v0, v[9:10], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_extractelement_v16f16_dynamic_sgpr: diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll index e9014e212b76f..dfd1fa6020eab 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll @@ -673,17 +673,17 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 { ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dword v0, v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e64 v1, |v0| +; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; CI-NEXT: v_mul_f32_e32 v0, 4.0, v0 -; CI-NEXT: v_add_f32_e32 v1, 2.0, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_mul_f32_e32 v1, 4.0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: flat_store_short v[0:1], v0 -; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_add_f32_e32 v0, 2.0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: flat_store_short v[0:1], v1 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: flat_store_short v[0:1], v0 +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_endpgm ; ; VI-LABEL: v_extract_fabs_fold_v2f16: @@ -788,7 +788,7 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0 ; CI-NEXT: flat_load_dword v0, v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_bfe_u32 v1, v0, 16, 15 -; CI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; CI-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 ; CI-NEXT: flat_store_short v[0:1], v0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: flat_store_short v[0:1], v1 diff --git a/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll b/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll index f45070cbe88ee..6eec710a4c24e 100644 --- a/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll @@ -756,6 +756,8 @@ define amdgpu_kernel void @fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_lhs() # ; GCN-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GCN-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GCN-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v4 +; GCN-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GCN-FLUSH-NEXT: v_mac_f32_e32 v3, v0, v1 ; GCN-FLUSH-NEXT: v_sub_f32_e32 v0, v3, v2 ; GCN-FLUSH-NEXT: buffer_store_dword v3, off, s[0:3], 0 @@ -781,6 +783,8 @@ define amdgpu_kernel void @fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_lhs() # ; GCN-FASTFMA-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GCN-FASTFMA-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GCN-FASTFMA-NEXT: v_mul_f32_e32 v3, v3, v4 +; GCN-FASTFMA-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-FASTFMA-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GCN-FASTFMA-NEXT: v_fma_f32 v0, v0, v1, v3 ; GCN-FASTFMA-NEXT: v_sub_f32_e32 v1, v0, v2 ; GCN-FASTFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -807,6 +811,8 @@ define amdgpu_kernel void @fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_lhs() # ; GCN-SLOWFMA-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v1, v3, v4 +; GCN-SLOWFMA-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-SLOWFMA-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GCN-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v1 ; GCN-SLOWFMA-NEXT: v_sub_f32_e32 v1, v0, v2 ; GCN-SLOWFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -846,6 +852,8 @@ define amdgpu_kernel void @fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_rhs() # ; GCN-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GCN-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GCN-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v4 +; GCN-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GCN-FLUSH-NEXT: v_mac_f32_e32 v3, v0, v1 ; GCN-FLUSH-NEXT: v_sub_f32_e32 v0, v2, v3 ; GCN-FLUSH-NEXT: buffer_store_dword v3, off, s[0:3], 0 @@ -871,6 +879,8 @@ define amdgpu_kernel void @fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_rhs() # ; GCN-FASTFMA-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GCN-FASTFMA-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GCN-FASTFMA-NEXT: v_mul_f32_e32 v3, v3, v4 +; GCN-FASTFMA-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-FASTFMA-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GCN-FASTFMA-NEXT: v_fma_f32 v0, v0, v1, v3 ; GCN-FASTFMA-NEXT: v_sub_f32_e32 v1, v2, v0 ; GCN-FASTFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -897,6 +907,8 @@ define amdgpu_kernel void @fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_rhs() # ; GCN-SLOWFMA-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v1, v3, v4 +; GCN-SLOWFMA-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-SLOWFMA-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GCN-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v1 ; GCN-SLOWFMA-NEXT: v_sub_f32_e32 v1, v2, v0 ; GCN-SLOWFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll index e57f0b6f33439..0b7533e2ecced 100644 --- a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll @@ -464,19 +464,19 @@ define amdgpu_kernel void @fadd_v2f16( ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v1, v3, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v0, v2, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; @@ -634,15 +634,15 @@ define amdgpu_kernel void @fadd_v2f16_imm_a( ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v1, 2.0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v0, 2.0, v0 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -773,15 +773,15 @@ define amdgpu_kernel void @fadd_v2f16_imm_b( ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v1, 2.0, v1 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v0, 2.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index a723a67498d05..805b1421f94d0 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -2487,19 +2487,19 @@ define <4 x half> @v_test_canonicalize_var_v4f16(<4 x half> %val) #1 { ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; CI-NEXT: v_or_b32_e32 v0, v0, v2 -; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; CI-NEXT: v_or_b32_e32 v1, v1, v2 +; CI-NEXT: v_or_b32_e32 v1, v1, v3 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_var_v4f16: @@ -2615,7 +2615,7 @@ define <2 x half> @v_test_canonicalize_undef_reg_v2f16(half %val) #1 { ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v1, 0x7fc00000 +; CI-NEXT: v_cvt_f16_f32_e32 v1, 0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; CI-NEXT: v_or_b32_e32 v0, v1, v0 @@ -2912,7 +2912,7 @@ define <4 x half> @v_test_canonicalize_reg_undef_undef_undef_v4f16(half %val) #1 ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v1, 0x7fc00000 +; CI-NEXT: v_cvt_f16_f32_e32 v1, 0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; CI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -2964,7 +2964,7 @@ define <4 x half> @v_test_canonicalize_reg_reg_undef_undef_v4f16(half %val0, hal ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v2, 0x7fc00000 +; CI-NEXT: v_cvt_f16_f32_e32 v2, 0 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 @@ -3023,7 +3023,7 @@ define <4 x half> @v_test_canonicalize_reg_undef_reg_reg_v4f16(half %val0, half ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v3, 0x7fc00000 +; CI-NEXT: v_cvt_f16_f32_e32 v3, 0 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -3085,26 +3085,26 @@ define <6 x half> @v_test_canonicalize_var_v6f16(<6 x half> %val) #1 { ; CI-LABEL: v_test_canonicalize_var_v6f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_or_b32_e32 v0, v0, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; CI-NEXT: v_or_b32_e32 v1, v1, v3 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; CI-NEXT: v_or_b32_e32 v0, v0, v5 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -3150,33 +3150,33 @@ define <8 x half> @v_test_canonicalize_var_v8f16(<8 x half> %val) #1 { ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; CI-NEXT: v_or_b32_e32 v0, v0, v4 -; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; CI-NEXT: v_or_b32_e32 v1, v1, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v6 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; CI-NEXT: v_or_b32_e32 v0, v0, v4 +; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; CI-NEXT: v_or_b32_e32 v1, v1, v6 ; CI-NEXT: v_or_b32_e32 v2, v2, v4 -; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; CI-NEXT: v_or_b32_e32 v3, v3, v4 +; CI-NEXT: v_or_b32_e32 v3, v3, v5 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_var_v8f16: @@ -3229,47 +3229,47 @@ define <12 x half> @v_test_canonicalize_var_v12f16(<12 x half> %val) #1 { ; CI-LABEL: v_test_canonicalize_var_v12f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; CI-NEXT: v_or_b32_e32 v0, v0, v6 -; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; CI-NEXT: v_or_b32_e32 v1, v1, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v8 -; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v9 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; CI-NEXT: v_or_b32_e32 v2, v2, v6 -; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; CI-NEXT: v_or_b32_e32 v3, v3, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v11 ; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; CI-NEXT: v_or_b32_e32 v0, v0, v7 +; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 ; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; CI-NEXT: v_or_b32_e32 v4, v4, v6 -; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; CI-NEXT: v_or_b32_e32 v2, v2, v7 +; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 +; CI-NEXT: v_or_b32_e32 v1, v1, v8 +; CI-NEXT: v_or_b32_e32 v3, v3, v10 +; CI-NEXT: v_or_b32_e32 v4, v4, v7 ; CI-NEXT: v_or_b32_e32 v5, v5, v6 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -3334,61 +3334,61 @@ define <16 x half> @v_test_canonicalize_var_v16f16(<16 x half> %val) #1 { ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; CI-NEXT: v_lshrrev_b32_e32 v15, 16, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 +; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; CI-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; CI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; CI-NEXT: v_or_b32_e32 v0, v0, v8 -; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; CI-NEXT: v_or_b32_e32 v1, v1, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v10 -; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v11 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; CI-NEXT: v_lshrrev_b32_e32 v13, 16, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 ; CI-NEXT: v_or_b32_e32 v2, v2, v8 -; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; CI-NEXT: v_or_b32_e32 v3, v3, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v12 -; CI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v13 -; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; CI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v15 +; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; CI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; CI-NEXT: v_or_b32_e32 v4, v4, v8 -; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; CI-NEXT: v_or_b32_e32 v5, v5, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v14 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v14 +; CI-NEXT: v_or_b32_e32 v1, v1, v9 +; CI-NEXT: v_or_b32_e32 v3, v3, v12 +; CI-NEXT: v_or_b32_e32 v5, v5, v13 ; CI-NEXT: v_or_b32_e32 v6, v6, v8 -; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; CI-NEXT: v_or_b32_e32 v7, v7, v8 +; CI-NEXT: v_or_b32_e32 v7, v7, v11 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_var_v16f16: @@ -3486,117 +3486,117 @@ define <32 x half> @v_test_canonicalize_var_v32f16(<32 x half> %val) #1 { ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; CI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; CI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; CI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; CI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_or_b32_e32 v0, v0, v16 ; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 ; CI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; CI-NEXT: v_or_b32_e32 v1, v1, v17 ; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 -; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; CI-NEXT: v_or_b32_e32 v2, v2, v18 ; CI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; CI-NEXT: v_or_b32_e32 v3, v3, v19 +; CI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; CI-NEXT: v_or_b32_e32 v3, v3, v19 -; CI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; CI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; CI-NEXT: v_or_b32_e32 v4, v4, v16 ; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 ; CI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; CI-NEXT: v_or_b32_e32 v5, v5, v17 ; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v9 -; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; CI-NEXT: v_or_b32_e32 v6, v6, v18 ; CI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; CI-NEXT: v_or_b32_e32 v7, v7, v19 +; CI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; CI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; CI-NEXT: v_or_b32_e32 v7, v7, v19 -; CI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; CI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; CI-NEXT: v_or_b32_e32 v8, v8, v16 ; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 ; CI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; CI-NEXT: v_or_b32_e32 v9, v9, v17 ; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v13 -; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; CI-NEXT: v_or_b32_e32 v10, v10, v18 -; CI-NEXT: v_cvt_f16_f32_e32 v18, v19 +; CI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; CI-NEXT: v_or_b32_e32 v11, v11, v19 +; CI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; CI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; CI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; CI-NEXT: v_or_b32_e32 v11, v11, v18 -; CI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; CI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; CI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; CI-NEXT: v_or_b32_e32 v12, v12, v16 -; CI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; CI-NEXT: v_or_b32_e32 v13, v13, v16 -; CI-NEXT: v_cvt_f16_f32_e32 v16, v19 ; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; CI-NEXT: v_cvt_f16_f32_e32 v17, v18 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; CI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; CI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; CI-NEXT: v_or_b32_e32 v12, v12, v16 +; CI-NEXT: v_lshlrev_b32_e32 v16, 16, v19 +; CI-NEXT: v_or_b32_e32 v13, v13, v17 ; CI-NEXT: v_or_b32_e32 v14, v14, v16 -; CI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; CI-NEXT: v_or_b32_e32 v15, v15, v16 +; CI-NEXT: v_or_b32_e32 v15, v15, v18 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_var_v32f16: diff --git a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll index 17cc51d08a1e2..5d3f69c84b902 100644 --- a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll @@ -2455,26 +2455,26 @@ define amdgpu_kernel void @fcmp_v2f16_lt( ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cmp_lt_f32_e32 vcc, v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; SI-NEXT: v_cmp_lt_f32_e32 vcc, v4, v1 +; SI-NEXT: v_cmp_lt_f32_e32 vcc, v2, v3 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -2650,26 +2650,26 @@ define amdgpu_kernel void @fcmp_v2f16_eq( ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; SI-NEXT: v_cmp_eq_f32_e32 vcc, v4, v1 +; SI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v3 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -2844,26 +2844,26 @@ define amdgpu_kernel void @fcmp_v2f16_le( ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cmp_le_f32_e32 vcc, v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cmp_le_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; SI-NEXT: v_cmp_le_f32_e32 vcc, v4, v1 +; SI-NEXT: v_cmp_le_f32_e32 vcc, v2, v3 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -3038,26 +3038,26 @@ define amdgpu_kernel void @fcmp_v2f16_gt( ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cmp_gt_f32_e32 vcc, v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; SI-NEXT: v_cmp_gt_f32_e32 vcc, v4, v1 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, v2, v3 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -3233,26 +3233,26 @@ define amdgpu_kernel void @fcmp_v2f16_lg( ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cmp_lg_f32_e32 vcc, v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; SI-NEXT: v_cmp_lg_f32_e32 vcc, v4, v1 +; SI-NEXT: v_cmp_lg_f32_e32 vcc, v2, v3 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -3428,26 +3428,26 @@ define amdgpu_kernel void @fcmp_v2f16_ge( ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cmp_ge_f32_e32 vcc, v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; SI-NEXT: v_cmp_ge_f32_e32 vcc, v4, v1 +; SI-NEXT: v_cmp_ge_f32_e32 vcc, v2, v3 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -3623,26 +3623,26 @@ define amdgpu_kernel void @fcmp_v2f16_o( ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cmp_o_f32_e32 vcc, v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; SI-NEXT: v_cmp_o_f32_e32 vcc, v4, v1 +; SI-NEXT: v_cmp_o_f32_e32 vcc, v2, v3 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -3818,26 +3818,26 @@ define amdgpu_kernel void @fcmp_v2f16_u( ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cmp_u_f32_e32 vcc, v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cmp_u_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; SI-NEXT: v_cmp_u_f32_e32 vcc, v4, v1 +; SI-NEXT: v_cmp_u_f32_e32 vcc, v2, v3 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -4012,26 +4012,26 @@ define amdgpu_kernel void @fcmp_v2f16_nge( ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cmp_nge_f32_e32 vcc, v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; SI-NEXT: v_cmp_nge_f32_e32 vcc, v4, v1 +; SI-NEXT: v_cmp_nge_f32_e32 vcc, v2, v3 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -4206,26 +4206,26 @@ define amdgpu_kernel void @fcmp_v2f16_nlg( ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cmp_nlg_f32_e32 vcc, v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; SI-NEXT: v_cmp_nlg_f32_e32 vcc, v4, v1 +; SI-NEXT: v_cmp_nlg_f32_e32 vcc, v2, v3 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -4401,26 +4401,26 @@ define amdgpu_kernel void @fcmp_v2f16_ngt( ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v1 +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v3 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -4595,26 +4595,26 @@ define amdgpu_kernel void @fcmp_v2f16_nle( ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cmp_nle_f32_e32 vcc, v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; SI-NEXT: v_cmp_nle_f32_e32 vcc, v4, v1 +; SI-NEXT: v_cmp_nle_f32_e32 vcc, v2, v3 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -4789,26 +4789,26 @@ define amdgpu_kernel void @fcmp_v2f16_neq( ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cmp_neq_f32_e32 vcc, v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; SI-NEXT: v_cmp_neq_f32_e32 vcc, v4, v1 +; SI-NEXT: v_cmp_neq_f32_e32 vcc, v2, v3 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -4983,26 +4983,26 @@ define amdgpu_kernel void @fcmp_v2f16_nlt( ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v4, v1 +; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v2, v3 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index a8703d5d6e51d..3f6750546618f 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -741,21 +741,15 @@ define half @v_copysign_f16_bf16(half %mag, bfloat %sign.bf16) { ; GCN-LABEL: v_copysign_f16_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: s_brev_b32 s4, -2 +; GCN-NEXT: s_movk_i32 s4, 0x7fff ; GCN-NEXT: v_bfi_b32 v0, s4, v0, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_f16_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: s_brev_b32 s4, -2 +; GFX7-NEXT: s_movk_i32 s4, 0x7fff ; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_f16_bf16: @@ -791,22 +785,16 @@ define half @v_copysign_f16_bf16(half %mag, bfloat %sign.bf16) { define amdgpu_ps i32 @s_copysign_f16_bf16(half inreg %mag, bfloat inreg %sign.bf16) { ; GCN-LABEL: s_copysign_f16_bf16: ; GCN: ; %bb.0: -; GCN-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, s1 -; GCN-NEXT: s_brev_b32 s0, -2 -; GCN-NEXT: v_bfi_b32 v0, s0, v0, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: s_and_b32 s1, s1, 0x8000 +; GCN-NEXT: s_and_b32 s0, s0, 0x7fff +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_copysign_f16_bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, s1 -; GFX7-NEXT: s_brev_b32 s0, -2 -; GFX7-NEXT: v_bfi_b32 v0, s0, v0, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: s_and_b32 s1, s1, 0x8000 +; GFX7-NEXT: s_and_b32 s0, s0, 0x7fff +; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_copysign_f16_bf16: diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll index b80204e70851e..23753bc5970dd 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -16,12 +16,9 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_ps i16 @s_copysign_f16(half inreg %mag, half inreg %sign) { ; SI-LABEL: s_copysign_f16: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f32_f16_e32 v0, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_bfi_b32 v0, s0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s0, v0 +; SI-NEXT: s_and_b32 s1, s1, 0xffff8000 +; SI-NEXT: s_and_b32 s0, s0, 0x7fff +; SI-NEXT: s_or_b32 s0, s0, s1 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_copysign_f16: @@ -141,8 +138,7 @@ define amdgpu_ps i16 @s_test_copysign_f16_10.0(half inreg %mag) { define amdgpu_ps i16 @s_test_copysign_f16_neg1(half inreg %mag) { ; SI-LABEL: s_test_copysign_f16_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_and_b32 s0, s0, 0x7fff -; SI-NEXT: s_bitset1_b32 s0, 15 +; SI-NEXT: s_or_b32 s0, s0, 0xffff8000 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_test_copysign_f16_neg1: @@ -167,8 +163,7 @@ define amdgpu_ps i16 @s_test_copysign_f16_neg1(half inreg %mag) { define amdgpu_ps i16 @s_test_copysign_f16_neg10(half inreg %mag) { ; SI-LABEL: s_test_copysign_f16_neg10: ; SI: ; %bb.0: -; SI-NEXT: s_and_b32 s0, s0, 0x7fff -; SI-NEXT: s_bitset1_b32 s0, 15 +; SI-NEXT: s_or_b32 s0, s0, 0xffff8000 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_test_copysign_f16_neg10: @@ -348,11 +343,8 @@ define half @v_copysign_f16(half %mag, half %sign) { ; SI-LABEL: v_copysign_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_brev_b32 s4, -2 +; SI-NEXT: s_movk_i32 s4, 0x7fff ; SI-NEXT: v_bfi_b32 v0, s4, v0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_f16: @@ -484,8 +476,7 @@ define half @v_test_copysign_f16_neg1(half %mag) { ; SI-LABEL: v_test_copysign_f16_neg1: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; SI-NEXT: v_or_b32_e32 v0, 0x8000, v0 +; SI-NEXT: v_or_b32_e32 v0, 0xffff8000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_test_copysign_f16_neg1: @@ -519,8 +510,7 @@ define half @v_test_copysign_f16_neg10(half %mag) { ; SI-LABEL: v_test_copysign_f16_neg10: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; SI-NEXT: v_or_b32_e32 v0, 0x8000, v0 +; SI-NEXT: v_or_b32_e32 v0, 0xffff8000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_test_copysign_f16_neg10: @@ -739,10 +729,10 @@ define half @v_copysign_out_f16_mag_f16_sign_f32(half %mag, float %sign) { ; SI-LABEL: v_copysign_out_f16_mag_f16_sign_f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v1, 0x80000000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_f16_mag_f16_sign_f32: @@ -777,10 +767,10 @@ define half @v_copysign_out_f16_mag_f16_sign_f64(half %mag, double %sign) { ; SI-LABEL: v_copysign_out_f16_mag_f16_sign_f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v1, 0x80000000, v2 +; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_f16_mag_f16_sign_f64: @@ -816,11 +806,8 @@ define half @v_copysign_out_f16_mag_f32_sign_f16(float %mag, half %sign) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_movk_i32 s4, 0x7fff ; SI-NEXT: v_bfi_b32 v0, s4, v0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_f16_mag_f32_sign_f16: @@ -869,48 +856,41 @@ define half @v_copysign_out_f16_mag_f64_sign_f16(double %mag, half %sign) { ; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SI-NEXT: v_and_b32_e32 v3, 0xffe, v3 -; SI-NEXT: v_bfe_u32 v4, v1, 20, 11 -; SI-NEXT: s_movk_i32 s4, 0x3f1 +; SI-NEXT: v_bfe_u32 v1, v1, 20, 11 ; SI-NEXT: v_or_b32_e32 v0, v3, v0 -; SI-NEXT: v_sub_i32_e32 v5, vcc, s4, v4 +; SI-NEXT: v_sub_i32_e32 v4, vcc, 0x3f1, v1 ; SI-NEXT: v_or_b32_e32 v3, 0x1000, v0 -; SI-NEXT: v_med3_i32 v5, v5, 0, 13 -; SI-NEXT: v_lshrrev_b32_e32 v6, v5, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, v5, v6 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, v5, v3 +; SI-NEXT: v_med3_i32 v4, v4, 0, 13 +; SI-NEXT: v_lshrrev_b32_e32 v5, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v4, v3 ; SI-NEXT: s_movk_i32 s4, 0xfc10 ; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v4, vcc, s4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 12, v4 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 -; SI-NEXT: v_or_b32_e32 v5, v0, v5 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v4 -; SI-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; SI-NEXT: v_and_b32_e32 v5, 7, v3 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v5 -; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v1, vcc, s4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 12, v1 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_or_b32_e32 v4, v0, v4 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v1 +; SI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; SI-NEXT: v_and_b32_e32 v4, 7, v3 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 ; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 +; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshrrev_b32_e32 v3, 2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; SI-NEXT: v_mov_b32_e32 v5, 0x7c00 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v4 -; SI-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; SI-NEXT: v_mov_b32_e32 v6, 0x7e00 +; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, 0x7c00 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v1 +; SI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; SI-NEXT: v_mov_b32_e32 v5, 0x7e00 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_movk_i32 s4, 0x40f -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v1 ; SI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_brev_b32 s4, -2 +; SI-NEXT: s_movk_i32 s4, 0x7fff ; SI-NEXT: v_bfi_b32 v0, s4, v0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_f16_mag_f64_sign_f16: @@ -1105,53 +1085,47 @@ define half @v_copysign_out_f16_mag_f64_sign_f16(double %mag, half %sign) { define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, half inreg %sign) { ; SI-LABEL: s_copysign_out_f16_mag_f64_sign_f16: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; SI-NEXT: s_and_b32 s2, s1, 0x1ff -; SI-NEXT: s_or_b32 s0, s2, s0 -; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; SI-NEXT: s_and_b32 s3, s1, 0x1ff +; SI-NEXT: s_or_b32 s0, s3, s0 +; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; SI-NEXT: s_lshr_b32 s0, s1, 8 -; SI-NEXT: s_bfe_u32 s3, s1, 0xb0014 +; SI-NEXT: s_bfe_u32 s1, s1, 0xb0014 ; SI-NEXT: s_and_b32 s0, s0, 0xffe -; SI-NEXT: v_readfirstlane_b32 s2, v1 -; SI-NEXT: s_sub_i32 s4, 0x3f1, s3 -; SI-NEXT: s_or_b32 s0, s0, s2 -; SI-NEXT: v_med3_i32 v1, s4, 0, 13 -; SI-NEXT: s_or_b32 s2, s0, 0x1000 -; SI-NEXT: v_readfirstlane_b32 s4, v1 -; SI-NEXT: s_lshr_b32 s5, s2, s4 +; SI-NEXT: v_readfirstlane_b32 s3, v0 +; SI-NEXT: s_sub_i32 s4, 0x3f1, s1 +; SI-NEXT: s_or_b32 s0, s0, s3 +; SI-NEXT: v_med3_i32 v0, s4, 0, 13 +; SI-NEXT: s_or_b32 s3, s0, 0x1000 +; SI-NEXT: v_readfirstlane_b32 s4, v0 +; SI-NEXT: s_lshr_b32 s5, s3, s4 ; SI-NEXT: s_lshl_b32 s4, s5, s4 -; SI-NEXT: s_cmp_lg_u32 s4, s2 -; SI-NEXT: s_cselect_b32 s2, 1, 0 -; SI-NEXT: s_addk_i32 s3, 0xfc10 -; SI-NEXT: s_lshl_b32 s4, s3, 12 -; SI-NEXT: s_or_b32 s2, s5, s2 +; SI-NEXT: s_cmp_lg_u32 s4, s3 +; SI-NEXT: s_cselect_b32 s3, 1, 0 +; SI-NEXT: s_addk_i32 s1, 0xfc10 +; SI-NEXT: s_lshl_b32 s4, s1, 12 +; SI-NEXT: s_or_b32 s3, s5, s3 ; SI-NEXT: s_or_b32 s4, s0, s4 -; SI-NEXT: s_cmp_lt_i32 s3, 1 -; SI-NEXT: s_cselect_b32 s2, s2, s4 -; SI-NEXT: s_and_b32 s4, s2, 7 +; SI-NEXT: s_cmp_lt_i32 s1, 1 +; SI-NEXT: s_cselect_b32 s3, s3, s4 +; SI-NEXT: s_and_b32 s4, s3, 7 ; SI-NEXT: s_cmp_gt_i32 s4, 5 ; SI-NEXT: s_cselect_b32 s5, 1, 0 ; SI-NEXT: s_cmp_eq_u32 s4, 3 ; SI-NEXT: s_cselect_b32 s4, 1, 0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_lshr_b32 s2, s2, 2 -; SI-NEXT: s_add_i32 s2, s2, s4 -; SI-NEXT: s_cmp_lt_i32 s3, 31 -; SI-NEXT: s_cselect_b32 s2, s2, 0x7c00 +; SI-NEXT: s_lshr_b32 s3, s3, 2 +; SI-NEXT: s_add_i32 s3, s3, s4 +; SI-NEXT: s_cmp_lt_i32 s1, 31 +; SI-NEXT: s_cselect_b32 s3, s3, 0x7c00 ; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_movk_i32 s0, 0x7e00 ; SI-NEXT: s_cselect_b32 s0, s0, 0x7c00 -; SI-NEXT: s_cmpk_eq_i32 s3, 0x40f -; SI-NEXT: s_cselect_b32 s0, s0, s2 -; SI-NEXT: s_lshr_b32 s1, s1, 16 -; SI-NEXT: s_and_b32 s1, s1, 0x8000 -; SI-NEXT: s_or_b32 s0, s1, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_bfi_b32 v0, s0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s0, v0 +; SI-NEXT: s_cmpk_eq_i32 s1, 0x40f +; SI-NEXT: s_cselect_b32 s0, s0, s3 +; SI-NEXT: s_and_b32 s0, s0, 0x7fff +; SI-NEXT: s_and_b32 s1, s2, 0xffff8000 +; SI-NEXT: s_or_b32 s0, s0, s1 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_copysign_out_f16_mag_f64_sign_f16: @@ -1363,19 +1337,14 @@ define amdgpu_ps i32 @s_copysign_v2f16(<2 x half> inreg %arg_mag, <2 x half> inr ; SI-LABEL: s_copysign_v2f16: ; SI: ; %bb.0: ; SI-NEXT: s_lshr_b32 s2, s1, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; SI-NEXT: s_lshr_b32 s2, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s0 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_bfi_b32 v0, s0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_bfi_b32 v1, s0, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_readfirstlane_b32 s0, v0 +; SI-NEXT: s_and_b32 s3, s0, 0x7fff +; SI-NEXT: s_and_b32 s2, s2, 0x8000 +; SI-NEXT: s_bfe_u32 s0, s0, 0xf0010 +; SI-NEXT: s_and_b32 s1, s1, 0x8000 +; SI-NEXT: s_or_b32 s0, s0, s2 +; SI-NEXT: s_or_b32 s1, s3, s1 +; SI-NEXT: s_lshl_b32 s0, s0, 16 +; SI-NEXT: s_or_b32 s0, s1, s0 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_copysign_v2f16: @@ -1411,25 +1380,18 @@ define amdgpu_ps i32 @s_copysign_v2f16(<2 x half> inreg %arg_mag, <2 x half> inr define amdgpu_ps <3 x i16> @s_copysign_v3f16(<3 x half> inreg %arg_mag, <3 x half> inreg %arg_sign) { ; SI-LABEL: s_copysign_v3f16: ; SI: ; %bb.0: -; SI-NEXT: s_lshr_b32 s4, s2, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s1 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_bfi_b32 v0, s0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_bfi_b32 v1, s0, v5, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_bfi_b32 v2, s0, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_readfirstlane_b32 s0, v0 -; SI-NEXT: v_readfirstlane_b32 s1, v2 +; SI-NEXT: s_and_b32 s4, s2, 0x8000 +; SI-NEXT: s_lshr_b32 s2, s2, 16 +; SI-NEXT: s_and_b32 s5, s0, 0x7fff +; SI-NEXT: s_and_b32 s2, s2, 0x8000 +; SI-NEXT: s_bfe_u32 s0, s0, 0xf0010 +; SI-NEXT: s_or_b32 s0, s0, s2 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s3, s3, 0x8000 +; SI-NEXT: s_and_b32 s1, s1, 0x7fff +; SI-NEXT: s_lshl_b32 s0, s0, 16 +; SI-NEXT: s_or_b32 s1, s1, s3 +; SI-NEXT: s_or_b32 s0, s4, s0 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_copysign_v3f16: @@ -1476,33 +1438,24 @@ define amdgpu_ps <3 x i16> @s_copysign_v3f16(<3 x half> inreg %arg_mag, <3 x hal define amdgpu_ps <2 x i32> @s_copysign_v4f16(<4 x half> inreg %arg_mag, <4 x half> inreg %arg_sign) { ; SI-LABEL: s_copysign_v4f16: ; SI: ; %bb.0: +; SI-NEXT: s_lshr_b32 s5, s3, 16 +; SI-NEXT: s_and_b32 s6, s1, 0x7fff +; SI-NEXT: s_and_b32 s5, s5, 0x8000 +; SI-NEXT: s_bfe_u32 s1, s1, 0xf0010 +; SI-NEXT: s_and_b32 s3, s3, 0x8000 +; SI-NEXT: s_or_b32 s1, s1, s5 +; SI-NEXT: s_or_b32 s3, s6, s3 +; SI-NEXT: s_lshl_b32 s1, s1, 16 ; SI-NEXT: s_lshr_b32 s4, s2, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s3, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s1, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s1 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_bfi_b32 v2, s0, v3, v2 -; SI-NEXT: v_bfi_b32 v0, s0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_bfi_b32 v3, s0, v7, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_bfi_b32 v1, s0, v5, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_readfirstlane_b32 s0, v0 -; SI-NEXT: v_readfirstlane_b32 s1, v2 +; SI-NEXT: s_or_b32 s1, s3, s1 +; SI-NEXT: s_and_b32 s2, s2, 0x8000 +; SI-NEXT: s_and_b32 s3, s0, 0x7fff +; SI-NEXT: s_or_b32 s2, s3, s2 +; SI-NEXT: s_and_b32 s3, s4, 0x8000 +; SI-NEXT: s_bfe_u32 s0, s0, 0xf0010 +; SI-NEXT: s_or_b32 s0, s0, s3 +; SI-NEXT: s_lshl_b32 s0, s0, 16 +; SI-NEXT: s_or_b32 s0, s2, s0 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_copysign_v4f16: @@ -1549,59 +1502,42 @@ define amdgpu_ps <2 x i32> @s_copysign_v4f16(<4 x half> inreg %arg_mag, <4 x hal define amdgpu_ps <4 x i32> @s_copysign_v8f16(<8 x half> inreg %arg_mag, <8 x half> inreg %arg_sign) { ; SI-LABEL: s_copysign_v8f16: ; SI: ; %bb.0: +; SI-NEXT: s_lshr_b32 s11, s7, 16 +; SI-NEXT: s_and_b32 s12, s3, 0x7fff +; SI-NEXT: s_and_b32 s11, s11, 0x8000 +; SI-NEXT: s_bfe_u32 s3, s3, 0xf0010 +; SI-NEXT: s_and_b32 s7, s7, 0x8000 +; SI-NEXT: s_or_b32 s3, s3, s11 +; SI-NEXT: s_or_b32 s7, s12, s7 +; SI-NEXT: s_lshl_b32 s3, s3, 16 +; SI-NEXT: s_lshr_b32 s10, s6, 16 +; SI-NEXT: s_or_b32 s3, s7, s3 +; SI-NEXT: s_and_b32 s6, s6, 0x8000 +; SI-NEXT: s_and_b32 s7, s2, 0x7fff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s10, 0x8000 +; SI-NEXT: s_bfe_u32 s2, s2, 0xf0010 +; SI-NEXT: s_or_b32 s2, s2, s7 +; SI-NEXT: s_lshl_b32 s2, s2, 16 +; SI-NEXT: s_lshr_b32 s9, s5, 16 +; SI-NEXT: s_or_b32 s2, s6, s2 +; SI-NEXT: s_and_b32 s5, s5, 0x8000 +; SI-NEXT: s_and_b32 s6, s1, 0x7fff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s9, 0x8000 +; SI-NEXT: s_bfe_u32 s1, s1, 0xf0010 +; SI-NEXT: s_or_b32 s1, s1, s6 +; SI-NEXT: s_lshl_b32 s1, s1, 16 ; SI-NEXT: s_lshr_b32 s8, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s8 -; SI-NEXT: s_lshr_b32 s8, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 -; SI-NEXT: s_lshr_b32 s8, s5, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 -; SI-NEXT: s_lshr_b32 s8, s1, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 -; SI-NEXT: s_lshr_b32 s8, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 -; SI-NEXT: s_lshr_b32 s8, s2, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 -; SI-NEXT: s_lshr_b32 s8, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s8 -; SI-NEXT: s_lshr_b32 s8, s3, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s3 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_bfi_b32 v6, s0, v7, v6 -; SI-NEXT: v_bfi_b32 v4, s0, v5, v4 -; SI-NEXT: v_bfi_b32 v2, s0, v3, v2 -; SI-NEXT: v_bfi_b32 v0, s0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_bfi_b32 v7, s0, v15, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_bfi_b32 v5, s0, v13, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_bfi_b32 v3, s0, v11, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_bfi_b32 v1, s0, v9, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_readfirstlane_b32 s0, v0 -; SI-NEXT: v_readfirstlane_b32 s1, v2 -; SI-NEXT: v_readfirstlane_b32 s2, v4 -; SI-NEXT: v_readfirstlane_b32 s3, v6 +; SI-NEXT: s_or_b32 s1, s5, s1 +; SI-NEXT: s_and_b32 s4, s4, 0x8000 +; SI-NEXT: s_and_b32 s5, s0, 0x7fff +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s8, 0x8000 +; SI-NEXT: s_bfe_u32 s0, s0, 0xf0010 +; SI-NEXT: s_or_b32 s0, s0, s5 +; SI-NEXT: s_lshl_b32 s0, s0, 16 +; SI-NEXT: s_or_b32 s0, s4, s0 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_copysign_v8f16: @@ -1671,111 +1607,78 @@ define amdgpu_ps <4 x i32> @s_copysign_v8f16(<8 x half> inreg %arg_mag, <8 x hal define amdgpu_ps <8 x i32> @s_copysign_v16f16(<16 x half> inreg %arg_mag, <16 x half> inreg %arg_sign) { ; SI-LABEL: s_copysign_v16f16: ; SI: ; %bb.0: +; SI-NEXT: s_lshr_b32 s23, s15, 16 +; SI-NEXT: s_and_b32 s24, s7, 0x7fff +; SI-NEXT: s_and_b32 s23, s23, 0x8000 +; SI-NEXT: s_bfe_u32 s7, s7, 0xf0010 +; SI-NEXT: s_and_b32 s15, s15, 0x8000 +; SI-NEXT: s_or_b32 s7, s7, s23 +; SI-NEXT: s_or_b32 s15, s24, s15 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshr_b32 s22, s14, 16 +; SI-NEXT: s_or_b32 s7, s15, s7 +; SI-NEXT: s_and_b32 s14, s14, 0x8000 +; SI-NEXT: s_and_b32 s15, s6, 0x7fff +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: s_and_b32 s15, s22, 0x8000 +; SI-NEXT: s_bfe_u32 s6, s6, 0xf0010 +; SI-NEXT: s_or_b32 s6, s6, s15 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshr_b32 s21, s13, 16 +; SI-NEXT: s_or_b32 s6, s14, s6 +; SI-NEXT: s_and_b32 s13, s13, 0x8000 +; SI-NEXT: s_and_b32 s14, s5, 0x7fff +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_and_b32 s14, s21, 0x8000 +; SI-NEXT: s_bfe_u32 s5, s5, 0xf0010 +; SI-NEXT: s_or_b32 s5, s5, s14 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshr_b32 s20, s12, 16 +; SI-NEXT: s_or_b32 s5, s13, s5 +; SI-NEXT: s_and_b32 s12, s12, 0x8000 +; SI-NEXT: s_and_b32 s13, s4, 0x7fff +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s20, 0x8000 +; SI-NEXT: s_bfe_u32 s4, s4, 0xf0010 +; SI-NEXT: s_or_b32 s4, s4, s13 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshr_b32 s19, s11, 16 +; SI-NEXT: s_or_b32 s4, s12, s4 +; SI-NEXT: s_and_b32 s11, s11, 0x8000 +; SI-NEXT: s_and_b32 s12, s3, 0x7fff +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s12, s19, 0x8000 +; SI-NEXT: s_bfe_u32 s3, s3, 0xf0010 +; SI-NEXT: s_or_b32 s3, s3, s12 +; SI-NEXT: s_lshl_b32 s3, s3, 16 +; SI-NEXT: s_lshr_b32 s18, s10, 16 +; SI-NEXT: s_or_b32 s3, s11, s3 +; SI-NEXT: s_and_b32 s10, s10, 0x8000 +; SI-NEXT: s_and_b32 s11, s2, 0x7fff +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s18, 0x8000 +; SI-NEXT: s_bfe_u32 s2, s2, 0xf0010 +; SI-NEXT: s_or_b32 s2, s2, s11 +; SI-NEXT: s_lshl_b32 s2, s2, 16 +; SI-NEXT: s_lshr_b32 s17, s9, 16 +; SI-NEXT: s_or_b32 s2, s10, s2 +; SI-NEXT: s_and_b32 s9, s9, 0x8000 +; SI-NEXT: s_and_b32 s10, s1, 0x7fff +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s17, 0x8000 +; SI-NEXT: s_bfe_u32 s1, s1, 0xf0010 +; SI-NEXT: s_or_b32 s1, s1, s10 +; SI-NEXT: s_lshl_b32 s1, s1, 16 ; SI-NEXT: s_lshr_b32 s16, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: s_lshr_b32 s16, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 -; SI-NEXT: s_lshr_b32 s16, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s16 -; SI-NEXT: s_lshr_b32 s16, s1, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s16 -; SI-NEXT: s_lshr_b32 s16, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s16 -; SI-NEXT: s_lshr_b32 s16, s2, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s16 -; SI-NEXT: s_lshr_b32 s16, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s16 -; SI-NEXT: s_lshr_b32 s16, s3, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s16 -; SI-NEXT: s_lshr_b32 s16, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s16 -; SI-NEXT: s_lshr_b32 s16, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s16 -; SI-NEXT: s_lshr_b32 s16, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s16 -; SI-NEXT: s_lshr_b32 s16, s5, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s16 -; SI-NEXT: s_lshr_b32 s16, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s16 -; SI-NEXT: s_lshr_b32 s16, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s16 -; SI-NEXT: s_lshr_b32 s16, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s16 -; SI-NEXT: s_lshr_b32 s16, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s0 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_bfi_b32 v14, s0, v15, v14 -; SI-NEXT: v_bfi_b32 v18, s0, v19, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s6 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_bfi_b32 v10, s0, v11, v10 -; SI-NEXT: v_bfi_b32 v15, s0, v19, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s13 -; SI-NEXT: v_or_b32_e32 v14, v18, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s12 -; SI-NEXT: v_bfi_b32 v12, s0, v13, v12 -; SI-NEXT: v_bfi_b32 v8, s0, v9, v8 -; SI-NEXT: v_bfi_b32 v18, s0, v18, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_bfi_b32 v11, s0, v19, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s9 -; SI-NEXT: v_or_b32_e32 v12, v15, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s11 -; SI-NEXT: v_or_b32_e32 v10, v18, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s3 -; SI-NEXT: v_or_b32_e32 v8, v11, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s2 -; SI-NEXT: v_bfi_b32 v6, s0, v7, v6 -; SI-NEXT: v_bfi_b32 v4, s0, v5, v4 -; SI-NEXT: v_bfi_b32 v2, s0, v3, v2 -; SI-NEXT: v_bfi_b32 v0, s0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_bfi_b32 v7, s0, v18, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_bfi_b32 v5, s0, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_bfi_b32 v3, s0, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_bfi_b32 v1, s0, v17, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_readfirstlane_b32 s0, v0 -; SI-NEXT: v_readfirstlane_b32 s1, v2 -; SI-NEXT: v_readfirstlane_b32 s2, v4 -; SI-NEXT: v_readfirstlane_b32 s3, v6 -; SI-NEXT: v_readfirstlane_b32 s4, v8 -; SI-NEXT: v_readfirstlane_b32 s5, v10 -; SI-NEXT: v_readfirstlane_b32 s6, v12 -; SI-NEXT: v_readfirstlane_b32 s7, v14 +; SI-NEXT: s_or_b32 s1, s9, s1 +; SI-NEXT: s_and_b32 s8, s8, 0x8000 +; SI-NEXT: s_and_b32 s9, s0, 0x7fff +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s16, 0x8000 +; SI-NEXT: s_bfe_u32 s0, s0, 0xf0010 +; SI-NEXT: s_or_b32 s0, s0, s9 +; SI-NEXT: s_lshl_b32 s0, s0, 16 +; SI-NEXT: s_or_b32 s0, s8, s0 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_copysign_v16f16: @@ -1886,18 +1789,14 @@ define <2 x half> @v_copysign_v2f16(<2 x half> %mag, <2 x half> %sign) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v2, s4, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v3, 0x7fff, v0 +; SI-NEXT: v_and_b32_e32 v2, 0x8000, v2 +; SI-NEXT: v_bfe_u32 v0, v0, 16, 15 +; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_v2f16: @@ -1927,23 +1826,18 @@ define <3 x half> @v_copysign_v3f16(<3 x half> %mag, <3 x half> %sign) { ; SI-LABEL: v_copysign_v3f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v2 -; SI-NEXT: v_bfi_b32 v2, s4, v5, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0x8000, v3 +; SI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0x8000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v4, 0x7fff, v0 +; SI-NEXT: v_and_b32_e32 v2, 0x8000, v2 +; SI-NEXT: v_bfe_u32 v0, v0, 16, 15 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_v3f16: @@ -1977,30 +1871,23 @@ define <4 x half> @v_copysign_v4f16(<4 x half> %mag, <4 x half> %sign) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v2 -; SI-NEXT: v_bfi_b32 v2, s4, v5, v4 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v3 -; SI-NEXT: v_bfi_b32 v3, s4, v7, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0x8000, v3 +; SI-NEXT: v_and_b32_e32 v6, 0x7fff, v1 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: v_and_b32_e32 v6, 0x7fff, v0 +; SI-NEXT: v_and_b32_e32 v5, 0x8000, v5 +; SI-NEXT: v_bfe_u32 v1, v1, 16, 15 +; SI-NEXT: v_and_b32_e32 v4, 0x8000, v4 +; SI-NEXT: v_bfe_u32 v0, v0, 16, 15 +; SI-NEXT: v_and_b32_e32 v2, 0x8000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_v4f16: @@ -2033,55 +1920,42 @@ define <8 x half> @v_copysign_v8f16(<8 x half> %mag, <8 x half> %sign) { ; SI-LABEL: v_copysign_v8f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 +; SI-NEXT: v_and_b32_e32 v7, 0x8000, v7 +; SI-NEXT: v_and_b32_e32 v12, 0x7fff, v3 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; SI-NEXT: v_or_b32_e32 v7, v12, v7 +; SI-NEXT: v_and_b32_e32 v6, 0x8000, v6 +; SI-NEXT: v_and_b32_e32 v12, 0x7fff, v2 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v4 -; SI-NEXT: v_bfi_b32 v4, s4, v9, v8 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v5 -; SI-NEXT: v_bfi_b32 v5, s4, v11, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_bfi_b32 v2, s4, v2, v6 -; SI-NEXT: v_bfi_b32 v6, s4, v13, v12 -; SI-NEXT: v_or_b32_e32 v0, v0, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_bfi_b32 v3, s4, v3, v7 -; SI-NEXT: v_bfi_b32 v7, s4, v15, v14 -; SI-NEXT: v_or_b32_e32 v1, v1, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; SI-NEXT: v_or_b32_e32 v6, v12, v6 +; SI-NEXT: v_and_b32_e32 v5, 0x8000, v5 +; SI-NEXT: v_and_b32_e32 v12, 0x7fff, v1 +; SI-NEXT: v_or_b32_e32 v5, v12, v5 +; SI-NEXT: v_and_b32_e32 v12, 0x7fff, v0 +; SI-NEXT: v_and_b32_e32 v11, 0x8000, v11 +; SI-NEXT: v_bfe_u32 v3, v3, 16, 15 +; SI-NEXT: v_and_b32_e32 v10, 0x8000, v10 +; SI-NEXT: v_bfe_u32 v2, v2, 16, 15 +; SI-NEXT: v_and_b32_e32 v9, 0x8000, v9 +; SI-NEXT: v_bfe_u32 v1, v1, 16, 15 +; SI-NEXT: v_and_b32_e32 v8, 0x8000, v8 +; SI-NEXT: v_bfe_u32 v0, v0, 16, 15 +; SI-NEXT: v_and_b32_e32 v4, 0x8000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v11 +; SI-NEXT: v_or_b32_e32 v2, v2, v10 +; SI-NEXT: v_or_b32_e32 v1, v1, v9 +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: v_or_b32_e32 v4, v12, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_or_b32_e32 v3, v7, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_v8f16: @@ -2120,103 +1994,78 @@ define <16 x half> @v_copysign_v16f16(<16 x half> %mag, <16 x half> %sign) { ; SI-LABEL: v_copysign_v16f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; SI-NEXT: v_bfi_b32 v15, s4, v19, v15 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; SI-NEXT: v_bfi_b32 v7, s4, v7, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v0 -; SI-NEXT: v_bfi_b32 v14, s4, v18, v14 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 -; SI-NEXT: v_bfi_b32 v6, s4, v6, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_bfi_b32 v13, s4, v19, v13 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 -; SI-NEXT: v_bfi_b32 v5, s4, v5, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_bfi_b32 v12, s4, v18, v12 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 -; SI-NEXT: v_bfi_b32 v4, s4, v4, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_bfi_b32 v11, s4, v19, v11 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 -; SI-NEXT: v_bfi_b32 v3, s4, v3, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_bfi_b32 v10, s4, v18, v10 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; SI-NEXT: v_bfi_b32 v2, s4, v2, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v8 -; SI-NEXT: v_bfi_b32 v8, s4, v17, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v9 -; SI-NEXT: v_bfi_b32 v9, s4, v19, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v16, 0x8000, v15 +; SI-NEXT: v_and_b32_e32 v17, 0x7fff, v7 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_and_b32_e32 v17, 0x8000, v14 +; SI-NEXT: v_and_b32_e32 v18, 0x7fff, v6 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_and_b32_e32 v18, 0x8000, v13 +; SI-NEXT: v_and_b32_e32 v19, 0x7fff, v5 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_and_b32_e32 v19, 0x8000, v12 +; SI-NEXT: v_and_b32_e32 v20, 0x7fff, v4 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_and_b32_e32 v20, 0x8000, v11 +; SI-NEXT: v_and_b32_e32 v21, 0x7fff, v3 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_and_b32_e32 v21, 0x8000, v10 +; SI-NEXT: v_and_b32_e32 v22, 0x7fff, v2 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_and_b32_e32 v22, 0x8000, v9 +; SI-NEXT: v_and_b32_e32 v23, 0x7fff, v1 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_and_b32_e32 v23, 0x8000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v24, 0x7fff, v0 +; SI-NEXT: v_and_b32_e32 v15, 0x8000, v15 +; SI-NEXT: v_bfe_u32 v7, v7, 16, 15 +; SI-NEXT: v_and_b32_e32 v14, 0x8000, v14 +; SI-NEXT: v_bfe_u32 v6, v6, 16, 15 +; SI-NEXT: v_and_b32_e32 v13, 0x8000, v13 +; SI-NEXT: v_bfe_u32 v5, v5, 16, 15 +; SI-NEXT: v_and_b32_e32 v12, 0x8000, v12 +; SI-NEXT: v_bfe_u32 v4, v4, 16, 15 +; SI-NEXT: v_and_b32_e32 v11, 0x8000, v11 +; SI-NEXT: v_bfe_u32 v3, v3, 16, 15 +; SI-NEXT: v_and_b32_e32 v10, 0x8000, v10 +; SI-NEXT: v_bfe_u32 v2, v2, 16, 15 +; SI-NEXT: v_and_b32_e32 v9, 0x8000, v9 +; SI-NEXT: v_bfe_u32 v1, v1, 16, 15 +; SI-NEXT: v_and_b32_e32 v8, 0x8000, v8 +; SI-NEXT: v_bfe_u32 v0, v0, 16, 15 +; SI-NEXT: v_or_b32_e32 v7, v7, v15 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_or_b32_e32 v5, v5, v13 +; SI-NEXT: v_or_b32_e32 v4, v4, v12 +; SI-NEXT: v_or_b32_e32 v3, v3, v11 +; SI-NEXT: v_or_b32_e32 v2, v2, v10 +; SI-NEXT: v_or_b32_e32 v1, v1, v9 ; SI-NEXT: v_or_b32_e32 v0, v0, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v11 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v2, v8, v2 -; SI-NEXT: v_or_b32_e32 v3, v9, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v13 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v4, v8, v4 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: v_or_b32_e32 v1, v22, v1 +; SI-NEXT: v_or_b32_e32 v2, v21, v2 +; SI-NEXT: v_or_b32_e32 v3, v20, v3 +; SI-NEXT: v_or_b32_e32 v4, v19, v4 +; SI-NEXT: v_or_b32_e32 v5, v18, v5 +; SI-NEXT: v_or_b32_e32 v6, v17, v6 +; SI-NEXT: v_or_b32_e32 v7, v16, v7 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_v16f16: @@ -2267,201 +2116,178 @@ define <32 x half> @v_copysign_v32f32(<32 x half> %mag, <32 x half> %sign) { ; SI-LABEL: v_copysign_v32f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v14 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_and_b32_e32 v58, 0x8000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v59, 0x7fff, v1 +; SI-NEXT: v_and_b32_e32 v17, 0x8000, v17 +; SI-NEXT: v_bfe_u32 v1, v1, 16, 15 +; SI-NEXT: v_and_b32_e32 v38, 0x8000, v27 +; SI-NEXT: v_and_b32_e32 v39, 0x7fff, v11 +; SI-NEXT: v_and_b32_e32 v48, 0x8000, v26 +; SI-NEXT: v_and_b32_e32 v49, 0x7fff, v10 +; SI-NEXT: v_and_b32_e32 v50, 0x8000, v25 +; SI-NEXT: v_and_b32_e32 v51, 0x7fff, v9 +; SI-NEXT: v_and_b32_e32 v40, 0x8000, v22 +; SI-NEXT: v_and_b32_e32 v41, 0x7fff, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v17 +; SI-NEXT: v_and_b32_e32 v52, 0x8000, v24 +; SI-NEXT: v_and_b32_e32 v53, 0x7fff, v8 +; SI-NEXT: v_and_b32_e32 v54, 0x8000, v23 +; SI-NEXT: v_and_b32_e32 v55, 0x7fff, v7 +; SI-NEXT: v_and_b32_e32 v42, 0x8000, v21 +; SI-NEXT: v_and_b32_e32 v43, 0x7fff, v5 +; SI-NEXT: v_and_b32_e32 v44, 0x8000, v20 +; SI-NEXT: v_and_b32_e32 v45, 0x7fff, v4 +; SI-NEXT: v_and_b32_e32 v46, 0x8000, v19 +; SI-NEXT: v_and_b32_e32 v47, 0x7fff, v3 +; SI-NEXT: v_and_b32_e32 v56, 0x8000, v18 +; SI-NEXT: v_and_b32_e32 v57, 0x7fff, v2 +; SI-NEXT: v_or_b32_e32 v38, v39, v38 +; SI-NEXT: v_or_b32_e32 v39, v49, v48 +; SI-NEXT: v_or_b32_e32 v48, v51, v50 +; SI-NEXT: v_or_b32_e32 v51, v41, v40 +; SI-NEXT: v_or_b32_e32 v40, v59, v58 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v49, v53, v52 +; SI-NEXT: v_or_b32_e32 v50, v55, v54 +; SI-NEXT: v_or_b32_e32 v52, v43, v42 +; SI-NEXT: v_or_b32_e32 v53, v45, v44 +; SI-NEXT: v_or_b32_e32 v54, v47, v46 +; SI-NEXT: v_or_b32_e32 v55, v57, v56 +; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: v_and_b32_e32 v32, 0x8000, v30 +; SI-NEXT: v_and_b32_e32 v33, 0x7fff, v14 +; SI-NEXT: v_and_b32_e32 v34, 0x8000, v29 +; SI-NEXT: v_and_b32_e32 v35, 0x7fff, v13 +; SI-NEXT: v_or_b32_e32 v32, v33, v32 +; SI-NEXT: v_and_b32_e32 v33, 0x8000, v16 +; SI-NEXT: v_or_b32_e32 v34, v35, v34 +; SI-NEXT: v_and_b32_e32 v35, 0x7fff, v0 +; SI-NEXT: v_and_b32_e32 v36, 0x8000, v28 +; SI-NEXT: v_and_b32_e32 v37, 0x7fff, v12 +; SI-NEXT: v_or_b32_e32 v33, v35, v33 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v31, s4, v32, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v13 -; SI-NEXT: v_bfi_b32 v14, s4, v14, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_bfi_b32 v30, s4, v32, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_bfi_b32 v13, s4, v13, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v28 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_bfi_b32 v29, s4, v32, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_bfi_b32 v12, s4, v12, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v27 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_bfi_b32 v28, s4, v32, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_bfi_b32 v11, s4, v11, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v26 ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_bfi_b32 v27, s4, v32, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_bfi_b32 v10, s4, v10, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v25 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_bfi_b32 v32, s4, v32, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_bfi_b32 v9, s4, v9, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v24 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v34, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_bfi_b32 v26, s4, v26, v34 -; SI-NEXT: v_bfi_b32 v15, s4, v15, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_bfi_b32 v25, s4, v33, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 -; SI-NEXT: v_bfi_b32 v8, s4, v8, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_bfi_b32 v24, s4, v33, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_bfi_b32 v7, s4, v7, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v22 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_bfi_b32 v23, s4, v33, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_bfi_b32 v6, s4, v6, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_bfi_b32 v22, s4, v33, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_bfi_b32 v5, s4, v5, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v20 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_bfi_b32 v21, s4, v33, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_bfi_b32 v4, s4, v4, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_bfi_b32 v20, s4, v33, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_bfi_b32 v3, s4, v3, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v18 ; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_bfi_b32 v19, s4, v33, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_bfi_b32 v2, s4, v2, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_bfi_b32 v18, s4, v33, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_bfi_b32 v17, s4, v33, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v18 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v36, v37, v36 +; SI-NEXT: v_and_b32_e32 v37, 0x7fff, v15 +; SI-NEXT: v_bfe_u32 v15, v15, 16, 15 +; SI-NEXT: v_and_b32_e32 v30, 0x8000, v30 +; SI-NEXT: v_bfe_u32 v14, v14, 16, 15 +; SI-NEXT: v_and_b32_e32 v29, 0x8000, v29 +; SI-NEXT: v_bfe_u32 v13, v13, 16, 15 +; SI-NEXT: v_and_b32_e32 v28, 0x8000, v28 +; SI-NEXT: v_bfe_u32 v12, v12, 16, 15 +; SI-NEXT: v_and_b32_e32 v27, 0x8000, v27 +; SI-NEXT: v_bfe_u32 v11, v11, 16, 15 +; SI-NEXT: v_and_b32_e32 v26, 0x8000, v26 +; SI-NEXT: v_bfe_u32 v10, v10, 16, 15 +; SI-NEXT: v_and_b32_e32 v25, 0x8000, v25 +; SI-NEXT: v_bfe_u32 v9, v9, 16, 15 +; SI-NEXT: v_and_b32_e32 v24, 0x8000, v24 +; SI-NEXT: v_bfe_u32 v8, v8, 16, 15 +; SI-NEXT: v_and_b32_e32 v23, 0x8000, v23 +; SI-NEXT: v_bfe_u32 v7, v7, 16, 15 +; SI-NEXT: v_and_b32_e32 v22, 0x8000, v22 +; SI-NEXT: v_bfe_u32 v6, v6, 16, 15 +; SI-NEXT: v_and_b32_e32 v21, 0x8000, v21 +; SI-NEXT: v_bfe_u32 v5, v5, 16, 15 +; SI-NEXT: v_and_b32_e32 v20, 0x8000, v20 +; SI-NEXT: v_bfe_u32 v4, v4, 16, 15 +; SI-NEXT: v_and_b32_e32 v19, 0x8000, v19 +; SI-NEXT: v_bfe_u32 v3, v3, 16, 15 +; SI-NEXT: v_and_b32_e32 v18, 0x8000, v18 +; SI-NEXT: v_bfe_u32 v2, v2, 16, 15 +; SI-NEXT: v_and_b32_e32 v16, 0x8000, v16 +; SI-NEXT: v_bfe_u32 v0, v0, 16, 15 +; SI-NEXT: v_or_b32_e32 v14, v14, v30 +; SI-NEXT: v_or_b32_e32 v13, v13, v29 +; SI-NEXT: v_or_b32_e32 v12, v12, v28 +; SI-NEXT: v_or_b32_e32 v11, v11, v27 +; SI-NEXT: v_or_b32_e32 v10, v10, v26 +; SI-NEXT: v_or_b32_e32 v9, v9, v25 +; SI-NEXT: v_or_b32_e32 v8, v8, v24 +; SI-NEXT: v_or_b32_e32 v7, v7, v23 +; SI-NEXT: v_or_b32_e32 v6, v6, v22 +; SI-NEXT: v_or_b32_e32 v5, v5, v21 +; SI-NEXT: v_or_b32_e32 v4, v4, v20 +; SI-NEXT: v_or_b32_e32 v3, v3, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v16 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v0, v16, v0 -; SI-NEXT: v_or_b32_e32 v1, v17, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v20 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v2, v16, v2 -; SI-NEXT: v_or_b32_e32 v3, v17, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v22 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v4, v16, v4 -; SI-NEXT: v_or_b32_e32 v5, v17, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v24 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_or_b32_e32 v6, v16, v6 -; SI-NEXT: v_or_b32_e32 v7, v17, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v32 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v35, 0x8000, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v31, 0x8000, v31 +; SI-NEXT: v_or_b32_e32 v15, v15, v31 +; SI-NEXT: v_or_b32_e32 v35, v37, v35 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v8, v16, v8 -; SI-NEXT: v_or_b32_e32 v9, v17, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v28 -; SI-NEXT: v_or_b32_e32 v10, v16, v10 -; SI-NEXT: v_or_b32_e32 v11, v17, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v30 -; SI-NEXT: v_or_b32_e32 v12, v16, v12 -; SI-NEXT: v_or_b32_e32 v13, v17, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v26 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v2, v55, v2 +; SI-NEXT: v_or_b32_e32 v3, v54, v3 +; SI-NEXT: v_or_b32_e32 v4, v53, v4 +; SI-NEXT: v_or_b32_e32 v5, v52, v5 +; SI-NEXT: v_or_b32_e32 v6, v51, v6 +; SI-NEXT: v_or_b32_e32 v7, v50, v7 +; SI-NEXT: v_or_b32_e32 v8, v49, v8 +; SI-NEXT: v_or_b32_e32 v9, v48, v9 +; SI-NEXT: v_or_b32_e32 v10, v39, v10 +; SI-NEXT: v_or_b32_e32 v11, v38, v11 +; SI-NEXT: v_or_b32_e32 v12, v36, v12 +; SI-NEXT: v_or_b32_e32 v13, v34, v13 +; SI-NEXT: v_or_b32_e32 v14, v32, v14 +; SI-NEXT: v_or_b32_e32 v15, v35, v15 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_v32f32: @@ -2730,12 +2556,10 @@ define amdgpu_ps <2 x i32> @s_copysign_out_f64_mag_f64_sign_f16(double inreg %ma define amdgpu_ps i16 @s_copysign_out_f16_mag_f16_sign_f32(half inreg %mag, float inreg %sign) { ; SI-LABEL: s_copysign_out_f16_mag_f16_sign_f32: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_bfi_b32 v0, s0, v0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s0, v0 +; SI-NEXT: s_and_b32 s1, s1, 0x80000000 +; SI-NEXT: s_and_b32 s0, s0, 0x7fff +; SI-NEXT: s_lshr_b32 s1, s1, 16 +; SI-NEXT: s_or_b32 s0, s0, s1 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_copysign_out_f16_mag_f16_sign_f32: @@ -2781,12 +2605,10 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f16_sign_f32(half inreg %mag, float define amdgpu_ps i16 @s_copysign_out_f16_mag_f16_sign_f64(half inreg %mag, double inreg %sign) { ; SI-LABEL: s_copysign_out_f16_mag_f16_sign_f64: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_mov_b32_e32 v1, s2 -; SI-NEXT: v_bfi_b32 v0, s0, v0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s0, v0 +; SI-NEXT: s_and_b32 s1, s2, 0x80000000 +; SI-NEXT: s_and_b32 s0, s0, 0x7fff +; SI-NEXT: s_lshr_b32 s1, s1, 16 +; SI-NEXT: s_or_b32 s0, s0, s1 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_copysign_out_f16_mag_f16_sign_f64: @@ -2833,11 +2655,9 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f32_sign_f16(float inreg %mag, half ; SI-LABEL: s_copysign_out_f16_mag_f32_sign_f16: ; SI: ; %bb.0: ; SI-NEXT: v_cvt_f16_f32_e32 v0, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s1 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_bfi_b32 v0, s0, v0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_and_b32 s0, s1, 0xffff8000 +; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; SI-NEXT: v_or_b32_e32 v0, s0, v0 ; SI-NEXT: v_readfirstlane_b32 s0, v0 ; SI-NEXT: ; return to shader part epilog ; @@ -3061,18 +2881,15 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f32_sign_v2f16(<2 x float> %mag, < ; SI-LABEL: v_copysign_out_v2f16_mag_v2f32_sign_v2f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0x8000, v2 +; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v2, 0x8000, v3 +; SI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3114,107 +2931,98 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f32_sign_v2f16(<2 x float> %mag, < ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v2 -; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] - %mag.trunc = fptrunc <2 x float> %mag to <2 x half> - %out = call <2 x half> @llvm.copysign.v2f16(<2 x half> %mag.trunc, <2 x half> %sign) - ret <2 x half> %out -} - -define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag, <2 x half> %sign) { -; SI-LABEL: v_copysign_out_v2f16_mag_v2f64_sign_v2f16: -; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0x1ff, v1 -; SI-NEXT: v_or_b32_e32 v0, v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v1 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_and_b32_e32 v6, 0xffe, v6 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: v_bfe_u32 v7, v1, 20, 11 -; SI-NEXT: s_movk_i32 s4, 0x3f1 -; SI-NEXT: v_or_b32_e32 v0, v6, v0 -; SI-NEXT: v_sub_i32_e32 v8, vcc, s4, v7 -; SI-NEXT: v_or_b32_e32 v6, 0x1000, v0 -; SI-NEXT: v_med3_i32 v8, v8, 0, 13 -; SI-NEXT: v_lshrrev_b32_e32 v9, v8, v6 -; SI-NEXT: v_lshlrev_b32_e32 v8, v8, v9 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, v8, v6 -; SI-NEXT: s_movk_i32 s5, 0xfc10 -; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v7, vcc, s5, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 12, v7 -; SI-NEXT: v_or_b32_e32 v6, v9, v6 -; SI-NEXT: v_or_b32_e32 v8, v0, v8 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v7 -; SI-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc -; SI-NEXT: v_and_b32_e32 v8, 7, v6 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v8 -; SI-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v8 -; SI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_lshrrev_b32_e32 v6, 2, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; SI-NEXT: v_mov_b32_e32 v8, 0x7c00 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v7 -; SI-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc -; SI-NEXT: v_mov_b32_e32 v9, 0x7e00 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_movk_i32 s6, 0x40f -; SI-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %mag.trunc = fptrunc <2 x float> %mag to <2 x half> + %out = call <2 x half> @llvm.copysign.v2f16(<2 x half> %mag.trunc, <2 x half> %sign) + ret <2 x half> %out +} + +define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag, <2 x half> %sign) { +; SI-LABEL: v_copysign_out_v2f16_mag_v2f64_sign_v2f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_and_b32_e32 v7, 0x1ff, v3 ; SI-NEXT: v_or_b32_e32 v2, v7, v2 -; SI-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v3 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: v_and_b32_e32 v6, 0xffe, v6 ; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-NEXT: v_bfe_u32 v7, v3, 20, 11 +; SI-NEXT: v_bfe_u32 v3, v3, 20, 11 +; SI-NEXT: s_movk_i32 s4, 0x3f1 ; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: v_sub_i32_e32 v10, vcc, s4, v7 +; SI-NEXT: v_sub_i32_e32 v7, vcc, s4, v3 ; SI-NEXT: v_or_b32_e32 v6, 0x1000, v2 -; SI-NEXT: v_med3_i32 v10, v10, 0, 13 -; SI-NEXT: v_lshrrev_b32_e32 v11, v10, v6 -; SI-NEXT: v_lshlrev_b32_e32 v10, v10, v11 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, v10, v6 +; SI-NEXT: v_med3_i32 v7, v7, 0, 13 +; SI-NEXT: v_lshrrev_b32_e32 v8, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, v7, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v7, v6 +; SI-NEXT: s_movk_i32 s5, 0xfc10 ; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v7, vcc, s5, v7 -; SI-NEXT: v_lshlrev_b32_e32 v10, 12, v7 -; SI-NEXT: v_or_b32_e32 v6, v11, v6 -; SI-NEXT: v_or_b32_e32 v10, v2, v10 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v7 -; SI-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc -; SI-NEXT: v_and_b32_e32 v10, 7, v6 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v10 -; SI-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v10 -; SI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_add_i32_e32 v3, vcc, s5, v3 +; SI-NEXT: v_lshlrev_b32_e32 v7, 12, v3 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_or_b32_e32 v7, v2, v7 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc +; SI-NEXT: v_and_b32_e32 v7, 7, v6 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v7 +; SI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7 +; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: v_lshrrev_b32_e32 v6, 2, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v7 -; SI-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; SI-NEXT: v_mov_b32_e32 v7, 0x7c00 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc +; SI-NEXT: v_mov_b32_e32 v8, 0x7e00 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_movk_i32 s6, 0x40f +; SI-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v3 ; SI-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; SI-NEXT: v_and_b32_e32 v3, 0x8000, v3 +; SI-NEXT: v_and_b32_e32 v6, 0x1ff, v1 +; SI-NEXT: v_or_b32_e32 v0, v6, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xffe, v3 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_bfe_u32 v1, v1, 20, 11 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 +; SI-NEXT: v_sub_i32_e32 v6, vcc, s4, v1 +; SI-NEXT: v_or_b32_e32 v3, 0x1000, v0 +; SI-NEXT: v_med3_i32 v6, v6, 0, 13 +; SI-NEXT: v_lshrrev_b32_e32 v9, v6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, v6, v9 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v6, v3 +; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; SI-NEXT: v_add_i32_e32 v1, vcc, s5, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 12, v1 +; SI-NEXT: v_or_b32_e32 v3, v9, v3 +; SI-NEXT: v_or_b32_e32 v6, v0, v6 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v1 +; SI-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; SI-NEXT: v_and_b32_e32 v6, 7, v3 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v6 +; SI-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v6 +; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_lshrrev_b32_e32 v3, 2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v1 +; SI-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v1 +; SI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v1, s4, v2, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0x8000, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0x7fff, v2 +; SI-NEXT: v_and_b32_e32 v2, 0x8000, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3573,20 +3381,16 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f16_sign_v2f32(<2 x half> %mag, <2 ; SI-LABEL: v_copysign_out_v2f16_mag_v2f16_sign_v2f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v1 -; SI-NEXT: v_bfi_b32 v1, s4, v3, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v3, 0x7fff, v0 +; SI-NEXT: v_bfe_u32 v0, v0, 16, 15 +; SI-NEXT: v_and_b32_e32 v2, 0x8000, v2 +; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_v2f16_mag_v2f16_sign_v2f32: @@ -3636,16 +3440,16 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f16_sign_v2f64(<2 x half> %mag, <2 ; SI-LABEL: v_copysign_out_v2f16_mag_v2f16_sign_v2f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v4 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v2, 0x80000000, v2 +; SI-NEXT: v_and_b32_e32 v1, 0x7fff, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0x80000000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_bfe_u32 v0, v0, 16, 15 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_v2f16_mag_v2f16_sign_v2f64: @@ -3902,16 +3706,13 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f32_sign_v2f16(<2 x float> inre ; SI: ; %bb.0: ; SI-NEXT: v_cvt_f16_f32_e32 v1, s1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, s0 -; SI-NEXT: s_lshr_b32 s3, s2, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_bfi_b32 v1, s0, v1, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_bfi_b32 v0, s0, v0, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_lshr_b32 s0, s2, 16 +; SI-NEXT: s_and_b32 s0, s0, 0x8000 +; SI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; SI-NEXT: s_and_b32 s1, s2, 0x8000 +; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; SI-NEXT: v_or_b32_e32 v1, s0, v1 +; SI-NEXT: v_or_b32_e32 v0, s1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_readfirstlane_b32 s0, v0 @@ -3968,100 +3769,90 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f32_sign_v2f16(<2 x float> inre define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inreg %mag, <2 x half> inreg %sign) { ; SI-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16: ; SI: ; %bb.0: +; SI-NEXT: s_lshr_b32 s6, s3, 8 ; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: s_lshr_b32 s5, s1, 8 -; SI-NEXT: s_and_b32 s5, s5, 0xffe -; SI-NEXT: s_and_b32 s6, s1, 0x1ff -; SI-NEXT: s_or_b32 s0, s6, s0 +; SI-NEXT: s_and_b32 s8, s6, 0xffe +; SI-NEXT: s_and_b32 s6, s3, 0x1ff +; SI-NEXT: s_or_b32 s2, s6, s2 ; SI-NEXT: s_cselect_b64 s[6:7], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7] -; SI-NEXT: s_bfe_u32 s6, s1, 0xb0014 -; SI-NEXT: v_readfirstlane_b32 s0, v1 -; SI-NEXT: s_sub_i32 s7, 0x3f1, s6 -; SI-NEXT: s_or_b32 s0, s5, s0 -; SI-NEXT: v_med3_i32 v1, s7, 0, 13 -; SI-NEXT: s_or_b32 s5, s0, 0x1000 -; SI-NEXT: v_readfirstlane_b32 s7, v1 -; SI-NEXT: s_lshr_b32 s8, s5, s7 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7] +; SI-NEXT: s_bfe_u32 s3, s3, 0xb0014 +; SI-NEXT: v_readfirstlane_b32 s2, v0 +; SI-NEXT: s_sub_i32 s7, 0x3f1, s3 +; SI-NEXT: s_or_b32 s2, s8, s2 +; SI-NEXT: v_med3_i32 v0, s7, 0, 13 +; SI-NEXT: s_or_b32 s6, s2, 0x1000 +; SI-NEXT: v_readfirstlane_b32 s7, v0 +; SI-NEXT: s_lshr_b32 s8, s6, s7 ; SI-NEXT: s_lshl_b32 s7, s8, s7 -; SI-NEXT: s_cmp_lg_u32 s7, s5 -; SI-NEXT: s_cselect_b32 s5, 1, 0 -; SI-NEXT: s_addk_i32 s6, 0xfc10 -; SI-NEXT: s_lshl_b32 s7, s6, 12 -; SI-NEXT: s_or_b32 s5, s8, s5 -; SI-NEXT: s_or_b32 s7, s0, s7 -; SI-NEXT: s_cmp_lt_i32 s6, 1 -; SI-NEXT: s_cselect_b32 s5, s5, s7 -; SI-NEXT: s_and_b32 s7, s5, 7 +; SI-NEXT: s_cmp_lg_u32 s7, s6 +; SI-NEXT: s_cselect_b32 s6, 1, 0 +; SI-NEXT: s_addk_i32 s3, 0xfc10 +; SI-NEXT: s_lshl_b32 s7, s3, 12 +; SI-NEXT: s_or_b32 s6, s8, s6 +; SI-NEXT: s_or_b32 s7, s2, s7 +; SI-NEXT: s_cmp_lt_i32 s3, 1 +; SI-NEXT: s_cselect_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s6, 7 ; SI-NEXT: s_cmp_gt_i32 s7, 5 ; SI-NEXT: s_cselect_b32 s8, 1, 0 ; SI-NEXT: s_cmp_eq_u32 s7, 3 ; SI-NEXT: s_cselect_b32 s7, 1, 0 ; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_lshr_b32 s5, s5, 2 -; SI-NEXT: s_add_i32 s5, s5, s7 -; SI-NEXT: s_cmp_lt_i32 s6, 31 -; SI-NEXT: s_cselect_b32 s5, s5, 0x7c00 -; SI-NEXT: s_cmp_lg_u32 s0, 0 +; SI-NEXT: s_lshr_b32 s6, s6, 2 +; SI-NEXT: s_add_i32 s6, s6, s7 +; SI-NEXT: s_cmp_lt_i32 s3, 31 +; SI-NEXT: s_cselect_b32 s6, s6, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s2, 0 ; SI-NEXT: s_movk_i32 s7, 0x7e00 -; SI-NEXT: s_cselect_b32 s0, s7, 0x7c00 -; SI-NEXT: s_cmpk_eq_i32 s6, 0x40f -; SI-NEXT: s_cselect_b32 s0, s0, s5 -; SI-NEXT: s_lshr_b32 s1, s1, 16 -; SI-NEXT: s_and_b32 s1, s1, 0x8000 -; SI-NEXT: s_or_b32 s5, s1, s0 -; SI-NEXT: s_lshr_b32 s0, s3, 8 -; SI-NEXT: s_and_b32 s6, s0, 0xffe -; SI-NEXT: s_and_b32 s0, s3, 0x1ff -; SI-NEXT: s_or_b32 s0, s0, s2 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; SI-NEXT: v_readfirstlane_b32 s0, v1 -; SI-NEXT: s_bfe_u32 s2, s3, 0xb0014 -; SI-NEXT: s_or_b32 s0, s6, s0 -; SI-NEXT: s_sub_i32 s6, 0x3f1, s2 -; SI-NEXT: v_med3_i32 v1, s6, 0, 13 -; SI-NEXT: s_or_b32 s1, s0, 0x1000 -; SI-NEXT: v_readfirstlane_b32 s6, v1 -; SI-NEXT: s_lshr_b32 s8, s1, s6 -; SI-NEXT: s_lshl_b32 s6, s8, s6 -; SI-NEXT: s_cmp_lg_u32 s6, s1 -; SI-NEXT: s_cselect_b32 s1, 1, 0 -; SI-NEXT: s_addk_i32 s2, 0xfc10 -; SI-NEXT: s_lshl_b32 s6, s2, 12 -; SI-NEXT: s_or_b32 s1, s8, s1 -; SI-NEXT: s_or_b32 s6, s0, s6 -; SI-NEXT: s_cmp_lt_i32 s2, 1 -; SI-NEXT: s_cselect_b32 s1, s1, s6 -; SI-NEXT: s_and_b32 s6, s1, 7 -; SI-NEXT: s_cmp_gt_i32 s6, 5 +; SI-NEXT: s_cselect_b32 s2, s7, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s3, 0x40f +; SI-NEXT: s_cselect_b32 s6, s2, s6 +; SI-NEXT: s_lshr_b32 s2, s1, 8 +; SI-NEXT: s_and_b32 s8, s2, 0xffe +; SI-NEXT: s_and_b32 s2, s1, 0x1ff +; SI-NEXT: s_or_b32 s0, s2, s0 +; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] +; SI-NEXT: s_bfe_u32 s1, s1, 0xb0014 +; SI-NEXT: v_readfirstlane_b32 s0, v0 +; SI-NEXT: s_sub_i32 s3, 0x3f1, s1 +; SI-NEXT: s_or_b32 s0, s8, s0 +; SI-NEXT: v_med3_i32 v0, s3, 0, 13 +; SI-NEXT: s_or_b32 s2, s0, 0x1000 +; SI-NEXT: v_readfirstlane_b32 s3, v0 +; SI-NEXT: s_lshr_b32 s8, s2, s3 +; SI-NEXT: s_lshl_b32 s3, s8, s3 +; SI-NEXT: s_cmp_lg_u32 s3, s2 +; SI-NEXT: s_cselect_b32 s2, 1, 0 +; SI-NEXT: s_addk_i32 s1, 0xfc10 +; SI-NEXT: s_lshl_b32 s3, s1, 12 +; SI-NEXT: s_or_b32 s2, s8, s2 +; SI-NEXT: s_or_b32 s3, s0, s3 +; SI-NEXT: s_cmp_lt_i32 s1, 1 +; SI-NEXT: s_cselect_b32 s2, s2, s3 +; SI-NEXT: s_and_b32 s3, s2, 7 +; SI-NEXT: s_cmp_gt_i32 s3, 5 ; SI-NEXT: s_cselect_b32 s8, 1, 0 -; SI-NEXT: s_cmp_eq_u32 s6, 3 -; SI-NEXT: s_cselect_b32 s6, 1, 0 -; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: s_lshr_b32 s1, s1, 2 -; SI-NEXT: s_add_i32 s1, s1, s6 -; SI-NEXT: s_cmp_lt_i32 s2, 31 -; SI-NEXT: s_cselect_b32 s1, s1, 0x7c00 +; SI-NEXT: s_cmp_eq_u32 s3, 3 +; SI-NEXT: s_cselect_b32 s3, 1, 0 +; SI-NEXT: s_or_b32 s3, s3, s8 +; SI-NEXT: s_lshr_b32 s2, s2, 2 +; SI-NEXT: s_add_i32 s2, s2, s3 +; SI-NEXT: s_cmp_lt_i32 s1, 31 +; SI-NEXT: s_cselect_b32 s2, s2, 0x7c00 ; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_cselect_b32 s0, s7, 0x7c00 -; SI-NEXT: s_cmpk_eq_i32 s2, 0x40f -; SI-NEXT: s_cselect_b32 s0, s0, s1 -; SI-NEXT: s_lshr_b32 s1, s3, 16 -; SI-NEXT: s_and_b32 s1, s1, 0x8000 -; SI-NEXT: s_or_b32 s0, s1, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_bfi_b32 v0, s0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_bfi_b32 v1, s0, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_readfirstlane_b32 s0, v0 +; SI-NEXT: s_cmpk_eq_i32 s1, 0x40f +; SI-NEXT: s_cselect_b32 s0, s0, s2 +; SI-NEXT: s_and_b32 s0, s0, 0x7fff +; SI-NEXT: s_and_b32 s1, s4, 0x8000 +; SI-NEXT: s_or_b32 s0, s0, s1 +; SI-NEXT: s_and_b32 s1, s6, 0x7fff +; SI-NEXT: s_and_b32 s2, s5, 0x8000 +; SI-NEXT: s_or_b32 s1, s1, s2 +; SI-NEXT: s_lshl_b32 s1, s1, 16 +; SI-NEXT: s_or_b32 s0, s0, s1 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16: @@ -4358,16 +4149,12 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f16_sign_v2f32(<2 x half> inreg ; SI: ; %bb.0: ; SI-NEXT: v_cvt_f16_f32_e32 v1, s2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, s1 -; SI-NEXT: s_lshr_b32 s3, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_bfi_b32 v1, s0, v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_bfi_b32 v0, s0, v3, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_and_b32 s1, s0, 0x7fff +; SI-NEXT: s_bfe_u32 s0, s0, 0xf0010 +; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0x8000, v0 +; SI-NEXT: v_or_b32_e32 v1, s0, v1 +; SI-NEXT: v_or_b32_e32 v0, s1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_readfirstlane_b32 s0, v0 @@ -4424,19 +4211,16 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f16_sign_v2f32(<2 x half> inreg define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f16_sign_v2f64(<2 x half> inreg %mag, <2 x double> inreg %sign) { ; SI-LABEL: s_copysign_out_v2f16_mag_v2f16_sign_v2f64: ; SI: ; %bb.0: -; SI-NEXT: s_lshr_b32 s1, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s1 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: v_bfi_b32 v0, s0, v0, v2 -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: v_bfi_b32 v1, s0, v1, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_readfirstlane_b32 s0, v0 +; SI-NEXT: s_and_b32 s2, s2, 0x80000000 +; SI-NEXT: s_and_b32 s1, s0, 0x7fff +; SI-NEXT: s_lshr_b32 s2, s2, 16 +; SI-NEXT: s_or_b32 s1, s1, s2 +; SI-NEXT: s_and_b32 s2, s4, 0x80000000 +; SI-NEXT: s_lshr_b32 s2, s2, 16 +; SI-NEXT: s_bfe_u32 s0, s0, 0xf0010 +; SI-NEXT: s_or_b32 s0, s0, s2 +; SI-NEXT: s_lshl_b32 s0, s0, 16 +; SI-NEXT: s_or_b32 s0, s1, s0 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_copysign_out_v2f16_mag_v2f16_sign_v2f64: @@ -4618,14 +4402,14 @@ define <3 x double> @v_copysign_out_v3f64_mag_v3f64_sign_v3f16(<3 x double> %mag ; SI-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v6 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v8 ; SI-NEXT: v_bfi_b32 v5, s4, v5, v7 -; SI-NEXT: v_bfi_b32 v3, s4, v3, v8 +; SI-NEXT: v_bfi_b32 v3, s4, v3, v6 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3f16: @@ -4687,24 +4471,20 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f32_sign_v3f16(<3 x float> %mag, < ; SI-LABEL: v_copysign_out_v3f16_mag_v3f32_sign_v3f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v5 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_bfi_b32 v1, s4, v2, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v1 +; SI-NEXT: v_and_b32_e32 v1, 0x8000, v4 +; SI-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v2, 0x8000, v3 +; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0x8000, v2 +; SI-NEXT: v_and_b32_e32 v3, 0x7fff, v5 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -4764,139 +4544,126 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag, ; SI-LABEL: v_copysign_out_v3f16_mag_v3f64_sign_v3f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0x1ff, v3 -; SI-NEXT: v_or_b32_e32 v2, v10, v2 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; SI-NEXT: v_and_b32_e32 v9, 0x1ff, v3 +; SI-NEXT: v_or_b32_e32 v2, v9, v2 +; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v3 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_and_b32_e32 v7, 0xffe, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffe, v8 ; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-NEXT: v_bfe_u32 v10, v3, 20, 11 +; SI-NEXT: v_bfe_u32 v3, v3, 20, 11 ; SI-NEXT: s_movk_i32 s4, 0x3f1 -; SI-NEXT: v_or_b32_e32 v2, v7, v2 -; SI-NEXT: v_sub_i32_e32 v11, vcc, s4, v10 -; SI-NEXT: v_or_b32_e32 v7, 0x1000, v2 -; SI-NEXT: v_med3_i32 v11, v11, 0, 13 -; SI-NEXT: v_lshrrev_b32_e32 v12, v11, v7 -; SI-NEXT: v_lshlrev_b32_e32 v11, v11, v12 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, v11, v7 +; SI-NEXT: v_or_b32_e32 v2, v8, v2 +; SI-NEXT: v_sub_i32_e32 v9, vcc, s4, v3 +; SI-NEXT: v_or_b32_e32 v8, 0x1000, v2 +; SI-NEXT: v_med3_i32 v9, v9, 0, 13 +; SI-NEXT: v_lshrrev_b32_e32 v10, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, v9, v10 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v9, v8 ; SI-NEXT: s_movk_i32 s5, 0xfc10 -; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v10, vcc, s5, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 12, v10 -; SI-NEXT: v_or_b32_e32 v7, v12, v7 -; SI-NEXT: v_or_b32_e32 v11, v2, v11 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v10 -; SI-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc -; SI-NEXT: v_and_b32_e32 v11, 7, v7 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v11 -; SI-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v11 -; SI-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_lshrrev_b32_e32 v7, 2, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; SI-NEXT: v_mov_b32_e32 v11, 0x7c00 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v10 -; SI-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc -; SI-NEXT: v_mov_b32_e32 v12, 0x7e00 +; SI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, s5, v3 +; SI-NEXT: v_lshlrev_b32_e32 v9, 12, v3 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_or_b32_e32 v9, v2, v9 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc +; SI-NEXT: v_and_b32_e32 v9, 7, v8 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v9 +; SI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v9 +; SI-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_lshrrev_b32_e32 v8, 2, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; SI-NEXT: v_mov_b32_e32 v9, 0x7c00 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc +; SI-NEXT: v_mov_b32_e32 v10, 0x7e00 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_movk_i32 s6, 0x40f -; SI-NEXT: v_cndmask_b32_e32 v2, v11, v12, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v10 -; SI-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v7, 0x1ff, v1 -; SI-NEXT: v_and_b32_e32 v3, 0x8000, v3 -; SI-NEXT: v_or_b32_e32 v0, v7, v0 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v3 +; SI-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc +; SI-NEXT: v_and_b32_e32 v8, 0x1ff, v1 +; SI-NEXT: v_or_b32_e32 v0, v8, v0 ; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_and_b32_e32 v3, 0xffe, v3 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: v_bfe_u32 v7, v1, 20, 11 +; SI-NEXT: v_bfe_u32 v1, v1, 20, 11 ; SI-NEXT: v_or_b32_e32 v0, v3, v0 -; SI-NEXT: v_sub_i32_e32 v10, vcc, s4, v7 +; SI-NEXT: v_sub_i32_e32 v8, vcc, s4, v1 ; SI-NEXT: v_or_b32_e32 v3, 0x1000, v0 -; SI-NEXT: v_med3_i32 v10, v10, 0, 13 -; SI-NEXT: v_lshrrev_b32_e32 v13, v10, v3 -; SI-NEXT: v_lshlrev_b32_e32 v10, v10, v13 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, v10, v3 +; SI-NEXT: v_med3_i32 v8, v8, 0, 13 +; SI-NEXT: v_lshrrev_b32_e32 v11, v8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v8, v8, v11 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v8, v3 ; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v7, vcc, s5, v7 -; SI-NEXT: v_lshlrev_b32_e32 v10, 12, v7 -; SI-NEXT: v_or_b32_e32 v3, v13, v3 -; SI-NEXT: v_or_b32_e32 v10, v0, v10 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v7 -; SI-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; SI-NEXT: v_and_b32_e32 v10, 7, v3 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v10 -; SI-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v10 -; SI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v10, v10, v13 +; SI-NEXT: v_add_i32_e32 v1, vcc, s5, v1 +; SI-NEXT: v_lshlrev_b32_e32 v8, 12, v1 +; SI-NEXT: v_or_b32_e32 v3, v11, v3 +; SI-NEXT: v_or_b32_e32 v8, v0, v8 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v1 +; SI-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc +; SI-NEXT: v_and_b32_e32 v8, 7, v3 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v8 +; SI-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v8 +; SI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v8, v8, v11 ; SI-NEXT: v_lshrrev_b32_e32 v3, 2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v10 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v7 -; SI-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v1 +; SI-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_cndmask_b32_e32 v0, v11, v12, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7 +; SI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v1 ; SI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v3, 0x1ff, v5 -; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v5 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffe, v1 ; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; SI-NEXT: v_bfe_u32 v4, v5, 20, 11 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_sub_i32_e32 v7, vcc, s4, v4 +; SI-NEXT: v_sub_i32_e32 v5, vcc, s4, v4 ; SI-NEXT: v_or_b32_e32 v3, 0x1000, v1 -; SI-NEXT: v_med3_i32 v7, v7, 0, 13 -; SI-NEXT: v_lshrrev_b32_e32 v10, v7, v3 -; SI-NEXT: v_lshlrev_b32_e32 v7, v7, v10 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, v7, v3 +; SI-NEXT: v_med3_i32 v5, v5, 0, 13 +; SI-NEXT: v_lshrrev_b32_e32 v8, v5, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, v5, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v5, v3 ; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; SI-NEXT: v_add_i32_e32 v4, vcc, s5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v7, 12, v4 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: v_or_b32_e32 v7, v1, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 12, v4 +; SI-NEXT: v_or_b32_e32 v3, v8, v3 +; SI-NEXT: v_or_b32_e32 v5, v1, v5 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v4 -; SI-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; SI-NEXT: v_and_b32_e32 v7, 7, v3 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v7 -; SI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7 -; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; SI-NEXT: v_and_b32_e32 v5, 7, v3 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v5 +; SI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v5 +; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v5, v5, v8 ; SI-NEXT: v_lshrrev_b32_e32 v3, 2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v4 -; SI-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; SI-NEXT: v_cndmask_b32_e32 v1, v11, v12, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v4 ; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; SI-NEXT: v_and_b32_e32 v3, 0x8000, v7 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; SI-NEXT: v_and_b32_e32 v3, 0x8000, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; SI-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; SI-NEXT: v_and_b32_e32 v3, 0x8000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v2, s4, v2, v8 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -5394,25 +5161,20 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f16_sign_v3f32(<3 x half> %mag, <3 ; SI-LABEL: v_copysign_out_v3f16_mag_v3f16_sign_v3f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v2 -; SI-NEXT: v_bfi_b32 v2, s4, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; SI-NEXT: v_and_b32_e32 v4, 0x8000, v4 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_and_b32_e32 v4, 0x7fff, v0 +; SI-NEXT: v_and_b32_e32 v3, 0x8000, v3 +; SI-NEXT: v_bfe_u32 v0, v0, 16, 15 +; SI-NEXT: v_and_b32_e32 v2, 0x8000, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_v3f16_mag_v3f16_sign_v3f32: @@ -5471,19 +5233,20 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f16_sign_v3f64(<3 x half> %mag, <3 ; SI-LABEL: v_copysign_out_v3f16_mag_v3f16_sign_v3f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v2, s4, v2, v5 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v2, 0x80000000, v7 +; SI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0x80000000, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0x7fff, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0x80000000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_bfe_u32 v0, v0, 16, 15 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_v3f16_mag_v3f16_sign_v3f64: @@ -5554,17 +5317,17 @@ define <4 x float> @v_copysign_out_v4f32_mag_v4f16_sign_v4f32(<4 x half> %mag, < ; SI-LABEL: v_copysign_out_v4f32_mag_v4f16_sign_v4f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: s_brev_b32 s4, -2 ; SI-NEXT: v_bfi_b32 v0, s4, v0, v2 ; SI-NEXT: v_bfi_b32 v2, s4, v1, v4 -; SI-NEXT: v_bfi_b32 v1, s4, v7, v3 -; SI-NEXT: v_bfi_b32 v3, s4, v6, v5 +; SI-NEXT: v_bfi_b32 v1, s4, v6, v3 +; SI-NEXT: v_bfi_b32 v3, s4, v7, v5 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_v4f32_mag_v4f16_sign_v4f32: @@ -5636,17 +5399,17 @@ define <4 x float> @v_copysign_out_v4f32_mag_v4f32_sign_v4f16(<4 x float> %mag, ; SI-LABEL: v_copysign_out_v4f32_mag_v4f32_sign_v4f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: s_brev_b32 s4, -2 ; SI-NEXT: v_bfi_b32 v0, s4, v0, v4 ; SI-NEXT: v_bfi_b32 v2, s4, v2, v5 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v7 -; SI-NEXT: v_bfi_b32 v3, s4, v3, v6 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v6 +; SI-NEXT: v_bfi_b32 v3, s4, v3, v7 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_v4f32_mag_v4f32_sign_v4f16: @@ -5721,10 +5484,10 @@ define <4 x double> @v_copysign_out_v4f64_mag_v4f64_sign_v4f16(<4 x double> %mag ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: s_brev_b32 s4, -2 ; SI-NEXT: v_bfi_b32 v1, s4, v1, v8 ; SI-NEXT: v_bfi_b32 v5, s4, v5, v9 @@ -5802,33 +5565,28 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f32_sign_v4f16(<4 x float> %mag, < ; SI-LABEL: v_copysign_out_v4f16_mag_v4f32_sign_v4f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v6 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v4 -; SI-NEXT: v_bfi_b32 v3, s4, v3, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_bfi_b32 v2, s4, v2, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_and_b32_e32 v5, 0x8000, v5 +; SI-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; SI-NEXT: v_and_b32_e32 v4, 0x8000, v4 +; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; SI-NEXT: v_or_b32_e32 v2, v2, v5 +; SI-NEXT: v_and_b32_e32 v5, 0x8000, v7 +; SI-NEXT: v_and_b32_e32 v3, 0x7fff, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_and_b32_e32 v4, 0x8000, v6 +; SI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_v4f16_mag_v4f32_sign_v4f16: @@ -5900,178 +5658,161 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag, ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: v_and_b32_e32 v12, 0xffe, v12 ; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-NEXT: v_bfe_u32 v13, v3, 20, 11 +; SI-NEXT: v_bfe_u32 v3, v3, 20, 11 ; SI-NEXT: s_movk_i32 s4, 0x3f1 ; SI-NEXT: v_or_b32_e32 v2, v12, v2 -; SI-NEXT: v_sub_i32_e32 v14, vcc, s4, v13 +; SI-NEXT: v_sub_i32_e32 v13, vcc, s4, v3 ; SI-NEXT: v_or_b32_e32 v12, 0x1000, v2 -; SI-NEXT: v_med3_i32 v14, v14, 0, 13 -; SI-NEXT: v_lshrrev_b32_e32 v15, v14, v12 -; SI-NEXT: v_lshlrev_b32_e32 v14, v14, v15 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, v14, v12 +; SI-NEXT: v_med3_i32 v13, v13, 0, 13 +; SI-NEXT: v_lshrrev_b32_e32 v14, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, v13, v14 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v13, v12 ; SI-NEXT: s_movk_i32 s5, 0xfc10 ; SI-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v13, vcc, s5, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 12, v13 -; SI-NEXT: v_or_b32_e32 v12, v15, v12 -; SI-NEXT: v_or_b32_e32 v14, v2, v14 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v13 -; SI-NEXT: v_cndmask_b32_e32 v12, v14, v12, vcc -; SI-NEXT: v_and_b32_e32 v14, 7, v12 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v14 -; SI-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v3, vcc, s5, v3 +; SI-NEXT: v_lshlrev_b32_e32 v13, 12, v3 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_or_b32_e32 v13, v2, v13 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc +; SI-NEXT: v_and_b32_e32 v13, 7, v12 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v13 ; SI-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v13 +; SI-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: v_lshrrev_b32_e32 v12, 2, v12 -; SI-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; SI-NEXT: v_mov_b32_e32 v14, 0x7c00 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v13 -; SI-NEXT: v_cndmask_b32_e32 v12, v14, v12, vcc -; SI-NEXT: v_mov_b32_e32 v15, 0x7e00 +; SI-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; SI-NEXT: v_mov_b32_e32 v13, 0x7c00 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc +; SI-NEXT: v_mov_b32_e32 v14, 0x7e00 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_movk_i32 s6, 0x40f -; SI-NEXT: v_cndmask_b32_e32 v2, v14, v15, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v13 +; SI-NEXT: v_cndmask_b32_e32 v2, v13, v14, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v3 ; SI-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_and_b32_e32 v12, 0x1ff, v1 -; SI-NEXT: v_and_b32_e32 v3, 0x8000, v3 ; SI-NEXT: v_or_b32_e32 v0, v12, v0 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_and_b32_e32 v3, 0xffe, v3 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: v_bfe_u32 v12, v1, 20, 11 +; SI-NEXT: v_bfe_u32 v1, v1, 20, 11 ; SI-NEXT: v_or_b32_e32 v0, v3, v0 -; SI-NEXT: v_sub_i32_e32 v13, vcc, s4, v12 +; SI-NEXT: v_sub_i32_e32 v12, vcc, s4, v1 ; SI-NEXT: v_or_b32_e32 v3, 0x1000, v0 -; SI-NEXT: v_med3_i32 v13, v13, 0, 13 -; SI-NEXT: v_lshrrev_b32_e32 v16, v13, v3 -; SI-NEXT: v_lshlrev_b32_e32 v13, v13, v16 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, v13, v3 +; SI-NEXT: v_med3_i32 v12, v12, 0, 13 +; SI-NEXT: v_lshrrev_b32_e32 v15, v12, v3 +; SI-NEXT: v_lshlrev_b32_e32 v12, v12, v15 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v12, v3 ; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v12, vcc, s5, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 12, v12 -; SI-NEXT: v_or_b32_e32 v3, v16, v3 -; SI-NEXT: v_or_b32_e32 v13, v0, v13 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v12 -; SI-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc -; SI-NEXT: v_and_b32_e32 v13, 7, v3 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v13 -; SI-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v13 -; SI-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v13, v13, v16 +; SI-NEXT: v_add_i32_e32 v1, vcc, s5, v1 +; SI-NEXT: v_lshlrev_b32_e32 v12, 12, v1 +; SI-NEXT: v_or_b32_e32 v3, v15, v3 +; SI-NEXT: v_or_b32_e32 v12, v0, v12 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v1 +; SI-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc +; SI-NEXT: v_and_b32_e32 v12, 7, v3 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v12 +; SI-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v12 +; SI-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v12, v12, v15 ; SI-NEXT: v_lshrrev_b32_e32 v3, 2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v13 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v12 -; SI-NEXT: v_cndmask_b32_e32 v3, v14, v3, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v12 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v1 +; SI-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_cndmask_b32_e32 v0, v14, v15, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v12 +; SI-NEXT: v_cndmask_b32_e32 v0, v13, v14, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v1 ; SI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v3, 0x1ff, v7 -; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 ; SI-NEXT: v_or_b32_e32 v3, v3, v6 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v7 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffe, v1 ; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; SI-NEXT: v_bfe_u32 v6, v7, 20, 11 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_sub_i32_e32 v12, vcc, s4, v6 +; SI-NEXT: v_sub_i32_e32 v7, vcc, s4, v6 ; SI-NEXT: v_or_b32_e32 v3, 0x1000, v1 -; SI-NEXT: v_med3_i32 v12, v12, 0, 13 -; SI-NEXT: v_lshrrev_b32_e32 v13, v12, v3 -; SI-NEXT: v_lshlrev_b32_e32 v12, v12, v13 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, v12, v3 +; SI-NEXT: v_med3_i32 v7, v7, 0, 13 +; SI-NEXT: v_lshrrev_b32_e32 v12, v7, v3 +; SI-NEXT: v_lshlrev_b32_e32 v7, v7, v12 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v7, v3 ; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; SI-NEXT: v_add_i32_e32 v6, vcc, s5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v12, 12, v6 -; SI-NEXT: v_or_b32_e32 v3, v13, v3 -; SI-NEXT: v_or_b32_e32 v12, v1, v12 +; SI-NEXT: v_lshlrev_b32_e32 v7, 12, v6 +; SI-NEXT: v_or_b32_e32 v3, v12, v3 +; SI-NEXT: v_or_b32_e32 v7, v1, v7 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v6 -; SI-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; SI-NEXT: v_and_b32_e32 v12, 7, v3 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v12 -; SI-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v12 +; SI-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; SI-NEXT: v_and_b32_e32 v7, 7, v3 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v7 ; SI-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7 +; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v7, v7, v12 ; SI-NEXT: v_lshrrev_b32_e32 v3, 2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v12 +; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v6 -; SI-NEXT: v_cndmask_b32_e32 v3, v14, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; SI-NEXT: v_cndmask_b32_e32 v1, v14, v15, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v13, v14, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v6 -; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v7 ; SI-NEXT: v_and_b32_e32 v6, 0x1ff, v5 -; SI-NEXT: v_and_b32_e32 v3, 0x8000, v3 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v5 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: v_and_b32_e32 v3, 0xffe, v3 ; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; SI-NEXT: v_bfe_u32 v6, v5, 20, 11 +; SI-NEXT: v_bfe_u32 v5, v5, 20, 11 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_sub_i32_e32 v7, vcc, s4, v6 +; SI-NEXT: v_sub_i32_e32 v6, vcc, s4, v5 ; SI-NEXT: v_or_b32_e32 v4, 0x1000, v3 -; SI-NEXT: v_med3_i32 v7, v7, 0, 13 -; SI-NEXT: v_lshrrev_b32_e32 v12, v7, v4 -; SI-NEXT: v_lshlrev_b32_e32 v7, v7, v12 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, v7, v4 +; SI-NEXT: v_med3_i32 v6, v6, 0, 13 +; SI-NEXT: v_lshrrev_b32_e32 v7, v6, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, v6, v7 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v6, v4 ; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v6, vcc, s5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 12, v6 -; SI-NEXT: v_or_b32_e32 v4, v12, v4 -; SI-NEXT: v_or_b32_e32 v7, v3, v7 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v6 -; SI-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc -; SI-NEXT: v_and_b32_e32 v7, 7, v4 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v7 -; SI-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v5, vcc, s5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 12, v5 +; SI-NEXT: v_or_b32_e32 v4, v7, v4 +; SI-NEXT: v_or_b32_e32 v6, v3, v6 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v5 +; SI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; SI-NEXT: v_and_b32_e32 v6, 7, v4 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v6 ; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v7, v7, v12 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v6 +; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: v_lshrrev_b32_e32 v4, 2, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v6 -; SI-NEXT: v_cndmask_b32_e32 v4, v14, v4, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v5 +; SI-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; SI-NEXT: v_cndmask_b32_e32 v3, v14, v15, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v6 +; SI-NEXT: v_cndmask_b32_e32 v3, v13, v14, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v5 ; SI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; SI-NEXT: v_and_b32_e32 v4, 0x8000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v11 -; SI-NEXT: v_bfi_b32 v2, s4, v2, v10 -; SI-NEXT: v_bfi_b32 v3, s4, v3, v9 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0x7fff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0x8000, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; SI-NEXT: v_and_b32_e32 v4, 0x8000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; SI-NEXT: v_and_b32_e32 v4, 0x8000, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; SI-NEXT: v_and_b32_e32 v4, 0x8000, v10 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -6724,33 +6465,26 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f16_sign_v4f32(<4 x half> %mag, <4 ; SI-LABEL: v_copysign_out_v4f16_mag_v4f16_sign_v4f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v2 -; SI-NEXT: v_bfi_b32 v2, s4, v7, v5 -; SI-NEXT: v_bfi_b32 v3, s4, v6, v3 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v6, 0x7fff, v1 +; SI-NEXT: v_and_b32_e32 v4, 0x8000, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_and_b32_e32 v6, 0x7fff, v0 +; SI-NEXT: v_and_b32_e32 v5, 0x8000, v5 +; SI-NEXT: v_bfe_u32 v1, v1, 16, 15 +; SI-NEXT: v_and_b32_e32 v3, 0x8000, v3 +; SI-NEXT: v_bfe_u32 v0, v0, 16, 15 +; SI-NEXT: v_and_b32_e32 v2, 0x8000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 ; SI-NEXT: v_or_b32_e32 v0, v0, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_or_b32_e32 v1, v4, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_v4f16_mag_v4f16_sign_v4f32: @@ -6816,25 +6550,26 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f16_sign_v4f64(<4 x half> %mag, <4 ; SI-LABEL: v_copysign_out_v4f16_mag_v4f16_sign_v4f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v2, s4, v2, v5 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v3 -; SI-NEXT: v_bfi_b32 v3, s4, v4, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v4, 0x80000000, v7 +; SI-NEXT: v_and_b32_e32 v2, 0x7fff, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0x80000000, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v4, 0x7fff, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_and_b32_e32 v4, 0x80000000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_bfe_u32 v1, v1, 16, 15 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_and_b32_e32 v4, 0x80000000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_bfe_u32 v0, v0, 16, 15 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_v4f16_mag_v4f16_sign_v4f64: @@ -7268,7 +7003,7 @@ define half @v_copysign_f16_0_f64(double %sign) { ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_and_b32_e32 v0, 0x8000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0x7fff8000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_f16_0_f64: @@ -7306,16 +7041,7 @@ define half @v_copysign_f16_0_f64(double %sign) { define amdgpu_ps i32 @s_copysign_v2f16_0_v2f16(<2 x half> inreg %sign) { ; SI-LABEL: s_copysign_v2f16_0_v2f16: ; SI: ; %bb.0: -; SI-NEXT: s_lshr_b32 s1, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; SI-NEXT: v_and_b32_e32 v0, 0x80000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_and_b32_e32 v1, 0x80000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_readfirstlane_b32 s0, v0 +; SI-NEXT: s_and_b32 s0, s0, 0x80008000 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_copysign_v2f16_0_v2f16: @@ -7344,15 +7070,7 @@ define <2 x half> @v_copysign_v2f16_0_v2f16(<2 x half> %sign) { ; SI-LABEL: v_copysign_v2f16_0_v2f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_and_b32_e32 v1, 0x80000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_and_b32_e32 v0, 0x80000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0x80008000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_v2f16_0_v2f16: @@ -7380,16 +7098,12 @@ define <2 x half> @v_copysign_v2f16_0_v2f16(<2 x half> %sign) { define amdgpu_ps i32 @s_copysign_v2f16_0_v2f32(<2 x float> inreg %sign) { ; SI-LABEL: s_copysign_v2f16_0_v2f32: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v0, s1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_and_b32_e32 v0, 0x80000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_and_b32_e32 v1, 0x80000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, s0 +; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0x8000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_readfirstlane_b32 s0, v0 ; SI-NEXT: ; return to shader part epilog ; @@ -7444,12 +7158,8 @@ define <2 x half> @v_copysign_v2f16_0_v2bf32(<2 x float> %sign) { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_and_b32_e32 v1, 0x80000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_and_b32_e32 v0, 0x80000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0x8000, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -7499,13 +7209,10 @@ define <2 x half> @v_copysign_v2f16_0_v2bf32(<2 x float> %sign) { define amdgpu_ps i32 @s_copysign_v2f16_0_v2f64(<2 x double> inreg %sign) { ; SI-LABEL: s_copysign_v2f16_0_v2f64: ; SI: ; %bb.0: -; SI-NEXT: s_and_b32 s0, 0x80000000, s3 -; SI-NEXT: v_cvt_f16_f32_e32 v0, s0 -; SI-NEXT: s_and_b32 s0, 0x80000000, s1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_readfirstlane_b32 s0, v0 +; SI-NEXT: s_and_b32 s0, s1, 0x80000000 +; SI-NEXT: s_and_b32 s1, s3, 0x80000000 +; SI-NEXT: s_lshr_b32 s1, s1, 16 +; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], 16 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_copysign_v2f16_0_v2f64: @@ -7541,12 +7248,10 @@ define <2 x half> @v_copysign_v2f16_0_v2bf64(<2 x double> %sign) { ; SI-LABEL: v_copysign_v2f16_0_v2bf64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0x80000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_and_b32_e32 v1, 0x80000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 0x80000000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0x80000000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_v2f16_0_v2bf64: diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll index 1779c45203f47..fd5c47d36a752 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll @@ -2143,7 +2143,6 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 ; SI-NEXT: v_rcp_f32_e32 v3, v2 ; SI-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 @@ -2155,21 +2154,22 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) { ; SI-NEXT: v_fma_f32 v5, v6, v3, v5 ; SI-NEXT: v_fma_f32 v2, -v2, v5, v4 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_scale_f32 v4, s[4:5], v0, v0, 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; SI-NEXT: v_rcp_f32_e32 v5, v4 ; SI-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 -; SI-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; SI-NEXT: v_rcp_f32_e32 v3, v2 +; SI-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v2, -v4, v5, 1.0 -; SI-NEXT: v_fma_f32 v2, v2, v5, v5 -; SI-NEXT: v_mul_f32_e32 v5, v3, v2 -; SI-NEXT: v_fma_f32 v6, -v4, v5, v3 -; SI-NEXT: v_fma_f32 v5, v6, v2, v5 -; SI-NEXT: v_fma_f32 v3, -v4, v5, v3 +; SI-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; SI-NEXT: v_fma_f32 v3, v5, v3, v3 +; SI-NEXT: v_mul_f32_e32 v5, v4, v3 +; SI-NEXT: v_fma_f32 v6, -v2, v5, v4 +; SI-NEXT: v_fma_f32 v5, v6, v3, v5 +; SI-NEXT: v_fma_f32 v2, -v2, v5, v4 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v2, v3, v2, v5 +; SI-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; SI-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -2351,7 +2351,6 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0 ; SI-NEXT: v_rcp_f32_e32 v3, v2 ; SI-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0 @@ -2363,21 +2362,22 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) { ; SI-NEXT: v_fma_f32 v5, v6, v3, v5 ; SI-NEXT: v_fma_f32 v2, -v2, v5, v4 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_scale_f32 v4, s[4:5], v0, v0, -1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; SI-NEXT: v_rcp_f32_e32 v5, v4 ; SI-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0 -; SI-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, -1.0 +; SI-NEXT: v_rcp_f32_e32 v3, v2 +; SI-NEXT: v_div_scale_f32 v4, vcc, -1.0, v0, -1.0 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v2, -v4, v5, 1.0 -; SI-NEXT: v_fma_f32 v2, v2, v5, v5 -; SI-NEXT: v_mul_f32_e32 v5, v3, v2 -; SI-NEXT: v_fma_f32 v6, -v4, v5, v3 -; SI-NEXT: v_fma_f32 v5, v6, v2, v5 -; SI-NEXT: v_fma_f32 v3, -v4, v5, v3 +; SI-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; SI-NEXT: v_fma_f32 v3, v5, v3, v3 +; SI-NEXT: v_mul_f32_e32 v5, v4, v3 +; SI-NEXT: v_fma_f32 v6, -v2, v5, v4 +; SI-NEXT: v_fma_f32 v5, v6, v3, v5 +; SI-NEXT: v_fma_f32 v2, -v2, v5, v4 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v2, v3, v2, v5 +; SI-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; SI-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index dcf0519dee355..262b6c53fa2f8 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -8690,24 +8690,22 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 ; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8718,7 +8716,7 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX7-NEXT: s_cbranch_execnz .LBB36_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result @@ -9071,24 +9069,22 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 ; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9099,7 +9095,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX7-NEXT: s_cbranch_execnz .LBB37_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %result = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -9454,24 +9450,22 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 ; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9482,7 +9476,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX7-NEXT: s_cbranch_execnz .LBB38_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds half, ptr %ptr, i64 -1024 %result = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -9814,23 +9808,21 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 ; GFX7-NEXT: v_not_b32_e32 v6, v3 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX7-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_add_f32_e32 v3, v3, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10183,14 +10175,12 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10554,14 +10544,12 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10833,10 +10821,8 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11117,8 +11103,6 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX7-NEXT: flat_load_dword v0, v[3:4] ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11494,24 +11478,22 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 ; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11522,7 +11504,7 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX7-NEXT: s_cbranch_execnz .LBB44_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %result = atomicrmw fadd ptr %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -11869,14 +11851,12 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16430,49 +16410,39 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB56_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result @@ -16626,49 +16596,39 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB57_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -16836,49 +16796,39 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB58_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 -512 %result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -17023,41 +16973,32 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB59_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17211,41 +17152,32 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB60_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17416,41 +17348,32 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB61_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17612,49 +17535,39 @@ define <2 x half> @flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB62_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB62_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %result = atomicrmw fadd ptr %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -17808,41 +17721,32 @@ define void @flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB63_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB63_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17997,49 +17901,39 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(ptr ; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB64_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB64_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret <2 x half> %result @@ -18183,41 +18077,32 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr %pt ; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB65_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB65_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18371,49 +18256,39 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB66_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB66_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret <2 x half> %result @@ -18557,41 +18432,32 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__ ; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB67_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB67_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index a412a4eebe7ea..3919ba4e2b1c2 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -6350,24 +6350,22 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 ; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6378,7 +6376,7 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX7-NEXT: s_cbranch_execnz .LBB26_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result @@ -6756,24 +6754,22 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 ; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6784,7 +6780,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX7-NEXT: s_cbranch_execnz .LBB27_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %result = atomicrmw fmax ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -7164,24 +7160,22 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 ; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7192,7 +7186,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX7-NEXT: s_cbranch_execnz .LBB28_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds half, ptr %ptr, i64 -1024 %result = atomicrmw fmax ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -7544,23 +7538,21 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 ; GFX7-NEXT: v_not_b32_e32 v6, v3 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX7-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7938,14 +7930,12 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8334,14 +8324,12 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8644,8 +8632,6 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX7-NEXT: flat_load_dword v0, v[3:4] ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8935,10 +8921,8 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9339,24 +9323,22 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 ; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9367,7 +9349,7 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX7-NEXT: s_cbranch_execnz .LBB34_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %result = atomicrmw fmax ptr %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -9739,14 +9721,12 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14358,49 +14338,39 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX7-LABEL: flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB46_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result @@ -14602,49 +14572,39 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB47_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %result = atomicrmw fmax ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -14862,49 +14822,39 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB48_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 -512 %result = atomicrmw fmax ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -15096,41 +15046,32 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX7-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB49_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15331,41 +15272,32 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB50_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15586,41 +15518,32 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB51_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15830,49 +15753,39 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB52_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %result = atomicrmw fmax ptr %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -16073,41 +15986,32 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB53_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index c05d76a63a1d4..858ff79ade52f 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -6350,24 +6350,22 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 ; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6378,7 +6376,7 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX7-NEXT: s_cbranch_execnz .LBB26_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result @@ -6756,24 +6754,22 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 ; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6784,7 +6780,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX7-NEXT: s_cbranch_execnz .LBB27_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %result = atomicrmw fmin ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -7164,24 +7160,22 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 ; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7192,7 +7186,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX7-NEXT: s_cbranch_execnz .LBB28_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds half, ptr %ptr, i64 -1024 %result = atomicrmw fmin ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -7544,23 +7538,21 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 ; GFX7-NEXT: v_not_b32_e32 v6, v3 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX7-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7938,14 +7930,12 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8334,14 +8324,12 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8644,8 +8632,6 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX7-NEXT: flat_load_dword v0, v[3:4] ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8935,10 +8921,8 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9339,24 +9323,22 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 ; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9367,7 +9349,7 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX7-NEXT: s_cbranch_execnz .LBB34_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %result = atomicrmw fmin ptr %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -9739,14 +9721,12 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14358,49 +14338,39 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX7-LABEL: flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB46_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result @@ -14602,49 +14572,39 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB47_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %result = atomicrmw fmin ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -14862,49 +14822,39 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB48_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 -512 %result = atomicrmw fmin ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -15096,41 +15046,32 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX7-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB49_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15331,41 +15272,32 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB50_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15586,41 +15518,32 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB51_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15830,49 +15753,39 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB52_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %result = atomicrmw fmin ptr %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -16073,41 +15986,32 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB53_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll index d7c913cafd7d9..0fb799ea66461 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll @@ -6137,24 +6137,22 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 { ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 ; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_sub_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_sub_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6165,7 +6163,7 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 { ; GFX7-NEXT: s_cbranch_execnz .LBB22_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub ptr %ptr, half %val syncscope("agent") seq_cst ret half %result @@ -6518,24 +6516,22 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 ; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_sub_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_sub_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6546,7 +6542,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX7-NEXT: s_cbranch_execnz .LBB23_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %result = atomicrmw fsub ptr %gep, half %val syncscope("agent") seq_cst @@ -6901,24 +6897,22 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 ; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_sub_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_sub_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6929,7 +6923,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX7-NEXT: s_cbranch_execnz .LBB24_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds half, ptr %ptr, i64 -1024 %result = atomicrmw fsub ptr %gep, half %val syncscope("agent") seq_cst @@ -7261,23 +7255,21 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 { ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 ; GFX7-NEXT: v_not_b32_e32 v6, v3 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX7-NEXT: v_sub_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_sub_f32_e32 v3, v3, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7630,14 +7622,12 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8001,14 +7991,12 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8291,8 +8279,6 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal ; GFX7-NEXT: flat_load_dword v0, v[3:4] ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8562,10 +8548,8 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8941,24 +8925,22 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 ; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_sub_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_sub_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8969,7 +8951,7 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX7-NEXT: s_cbranch_execnz .LBB30_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %result = atomicrmw fsub ptr %gep, half %val seq_cst @@ -9316,14 +9298,12 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13908,49 +13888,39 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) # ; GFX7-LABEL: flat_agent_atomic_fsub_ret_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB42_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub ptr %ptr, <2 x half> %val syncscope("agent") seq_cst ret <2 x half> %result @@ -14135,49 +14105,39 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB43_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %result = atomicrmw fsub ptr %gep, <2 x half> %val syncscope("agent") seq_cst @@ -14378,49 +14338,39 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB44_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 -512 %result = atomicrmw fsub ptr %gep, <2 x half> %val syncscope("agent") seq_cst @@ -14593,41 +14543,32 @@ define void @flat_agent_atomic_fsub_noret_v2f16(ptr %ptr, <2 x half> %val) #0 { ; GFX7-LABEL: flat_agent_atomic_fsub_noret_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB45_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14809,41 +14750,32 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x ha ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB46_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15045,41 +14977,32 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB47_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15272,49 +15195,39 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB48_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %result = atomicrmw fsub ptr %gep, <2 x half> %val seq_cst @@ -15496,41 +15409,32 @@ define void @flat_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB49_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll b/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll index 7afdf102f5295..9a3dc507f295b 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll @@ -1039,20 +1039,26 @@ define half @v_max3_f16_maximumnum_maximumnum__v_v_v_0(half %a, half %b, half %c ; GFX6-LABEL: v_max3_f16_maximumnum_maximumnum__v_v_v_0: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_max3_f32 v0, v0, v1, v2 +; GFX6-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_max3_f16_maximumnum_maximumnum__v_v_v_0: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_max3_f32 v0, v0, v1, v2 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1118,40 +1124,52 @@ define <2 x half> @v_max3_v2f16_maximumnum_maximumnum__v_v_v_0(<2 x half> %a, <2 ; GFX6-LABEL: v_max3_v2f16_maximumnum_maximumnum__v_v_v_0: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_max3_f32 v3, v5, v4, v3 +; GFX6-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_max3_f32 v0, v0, v1, v2 +; GFX6-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_max_f32_e32 v1, v3, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_max3_v2f16_maximumnum_maximumnum__v_v_v_0: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_max3_f32 v3, v5, v4, v3 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_max3_f32 v0, v0, v1, v2 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_max_f32_e32 v1, v3, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1237,23 +1255,32 @@ define <3 x half> @v_max3_v3f16_maximumnum_maximumnum__v_v_v_0(<3 x half> %a, <3 ; GFX6-LABEL: v_max3_v3f16_maximumnum_maximumnum__v_v_v_0: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_max3_f32 v6, v8, v7, v6 -; GFX6-NEXT: v_max3_f32 v0, v0, v2, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v6 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v7 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v8 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_max3_f32 v1, v1, v3, v5 +; GFX6-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_max_f32_e32 v2, v3, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_max_f32_e32 v1, v1, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 @@ -1262,23 +1289,32 @@ define <3 x half> @v_max3_v3f16_maximumnum_maximumnum__v_v_v_0(<3 x half> %a, <3 ; GFX7-LABEL: v_max3_v3f16_maximumnum_maximumnum__v_v_v_0: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_max3_f32 v6, v8, v7, v6 -; GFX7-NEXT: v_max3_f32 v0, v0, v2, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v6 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v8 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_max3_f32 v1, v1, v3, v5 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_max_f32_e32 v2, v3, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -1397,70 +1433,94 @@ define <4 x half> @v_max3_v4f16_maximumnum_maximumnum__v_v_v_0(<4 x half> %a, <4 ; GFX6-LABEL: v_max3_v4f16_maximumnum_maximumnum__v_v_v_0: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; GFX6-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v11, 16, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_max_f32_e32 v7, v7, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v8, v11 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_max3_f32 v6, v8, v7, v6 -; GFX6-NEXT: v_max3_f32 v9, v11, v10, v9 -; GFX6-NEXT: v_max3_f32 v0, v0, v2, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v6 -; GFX6-NEXT: v_max3_f32 v1, v1, v3, v5 +; GFX6-NEXT: v_max_f32_e32 v8, v10, v8 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX6-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v8, v9 +; GFX6-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX6-NEXT: v_max_f32_e32 v2, v3, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_max_f32_e32 v3, v3, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX6-NEXT: v_max_f32_e32 v0, v0, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v9 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_max3_v4f16_maximumnum_maximumnum__v_v_v_0: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v11 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_max3_f32 v6, v8, v7, v6 -; GFX7-NEXT: v_max3_f32 v9, v11, v10, v9 -; GFX7-NEXT: v_max3_f32 v0, v0, v2, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v6 -; GFX7-NEXT: v_max3_f32 v1, v1, v3, v5 +; GFX7-NEXT: v_max_f32_e32 v8, v10, v8 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v9 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX7-NEXT: v_max_f32_e32 v2, v3, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v9 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll index 38ab4c2712a2c..94f7eee4a6efb 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll @@ -400,27 +400,30 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_mov_b32 s19, s11 -; SI-NEXT: s_mov_b32 s22, s10 -; SI-NEXT: s_mov_b32 s23, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: s_mov_b32 s16, s4 ; SI-NEXT: s_mov_b32 s17, s5 -; SI-NEXT: s_mov_b32 s20, s6 -; SI-NEXT: s_mov_b32 s21, s7 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v2, off, s[20:23], 0 glc -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: buffer_load_ushort v2, off, s[0:3], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_max3_f32 v0, v0, v1, v2 +; SI-NEXT: v_max_f32_e32 v0, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NEXT: v_max_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -690,27 +693,30 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_mov_b32 s19, s11 -; SI-NEXT: s_mov_b32 s22, s10 -; SI-NEXT: s_mov_b32 s23, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: s_mov_b32 s16, s4 ; SI-NEXT: s_mov_b32 s17, s5 -; SI-NEXT: s_mov_b32 s20, s6 -; SI-NEXT: s_mov_b32 s21, s7 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v2, off, s[20:23], 0 glc -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: buffer_load_ushort v2, off, s[0:3], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_max3_f32 v0, v2, v0, v1 +; SI-NEXT: v_max_f32_e32 v0, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NEXT: v_max_f32_e32 v0, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -977,20 +983,30 @@ define <2 x half> @no_fmax3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, < ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_max_f32_e32 v0, v0, v1 -; SI-NEXT: v_max_f32_e32 v1, v7, v6 -; SI-NEXT: v_max3_f32 v0, v2, v0, v3 -; SI-NEXT: v_max3_f32 v1, v5, v1, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_max_f32_e32 v5, v6, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_max_f32_e32 v0, v2, v0 +; SI-NEXT: v_max_f32_e32 v1, v1, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_max_f32_e32 v0, v0, v3 +; SI-NEXT: v_max_f32_e32 v1, v1, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll index b187f39c786aa..1b494deca08aa 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll @@ -26,10 +26,10 @@ define half @test_fmax_legacy_ugt_f16(half %a, half %b) #0 { ; SI-LABEL: test_fmax_legacy_ugt_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_max_legacy_f32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_cmp_nle_f32_e32 vcc, v3, v2 +; SI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: test_fmax_legacy_ugt_f16: @@ -119,15 +119,16 @@ define <2 x half> @test_fmax_legacy_ugt_v2f16(<2 x half> %a, <2 x half> %b) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_max_legacy_f32_e32 v0, v1, v0 -; SI-NEXT: v_max_legacy_f32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v2 +; SI-NEXT: v_cmp_nle_f32_e32 vcc, v5, v4 +; SI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SI-NEXT: v_cmp_nle_f32_e32 vcc, v7, v6 +; SI-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -176,14 +177,14 @@ define <2 x half> @test_fmax_legacy_ugt_v2f16_fast(<2 x half> %a, <2 x half> %b) ; SI-LABEL: test_fmax_legacy_ugt_v2f16_fast: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_max_f32_e32 v0, v0, v1 -; SI-NEXT: v_max_f32_e32 v1, v3, v2 +; SI-NEXT: v_max_f32_e32 v1, v2, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -235,19 +236,21 @@ define <3 x half> @test_fmax_legacy_ugt_v3f16(<3 x half> %a, <3 x half> %b) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_max_legacy_f32_e32 v1, v3, v1 -; SI-NEXT: v_max_legacy_f32_e32 v0, v2, v0 -; SI-NEXT: v_max_legacy_f32_e32 v2, v5, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 +; SI-NEXT: v_cmp_nle_f32_e32 vcc, v9, v8 +; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-NEXT: v_cmp_nle_f32_e32 vcc, v7, v6 +; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-NEXT: v_cmp_nle_f32_e32 vcc, v11, v10 +; SI-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -304,17 +307,17 @@ define <3 x half> @test_fmax_legacy_ugt_v3f16_fast(<3 x half> %a, <3 x half> %b) ; SI-LABEL: test_fmax_legacy_ugt_v3f16_fast: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_max_f32_e32 v1, v1, v3 ; SI-NEXT: v_max_f32_e32 v0, v0, v2 -; SI-NEXT: v_max_f32_e32 v2, v5, v4 +; SI-NEXT: v_max_f32_e32 v2, v4, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -381,28 +384,30 @@ define <4 x half> @test_fmax_legacy_ugt_v4f16(<4 x half> %a, <4 x half> %b) #0 { ; SI-LABEL: test_fmax_legacy_ugt_v4f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_max_legacy_f32_e32 v1, v3, v1 -; SI-NEXT: v_max_legacy_f32_e32 v0, v2, v0 -; SI-NEXT: v_max_legacy_f32_e32 v2, v7, v6 -; SI-NEXT: v_max_legacy_f32_e32 v3, v5, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v4 +; SI-NEXT: v_cmp_nle_f32_e32 vcc, v11, v10 +; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-NEXT: v_cmp_nle_f32_e32 vcc, v9, v8 +; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-NEXT: v_cmp_nle_f32_e32 vcc, v15, v14 +; SI-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc +; SI-NEXT: v_cmp_nle_f32_e32 vcc, v13, v12 +; SI-NEXT: v_cndmask_b32_e32 v3, v7, v6, vcc +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v0, v0, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -469,28 +474,28 @@ define <4 x half> @test_fmax_legacy_ugt_v4f16_fast(<4 x half> %a, <4 x half> %b) ; SI-LABEL: test_fmax_legacy_ugt_v4f16_fast: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_max_f32_e32 v1, v1, v3 ; SI-NEXT: v_max_f32_e32 v0, v0, v2 -; SI-NEXT: v_max_f32_e32 v2, v7, v6 -; SI-NEXT: v_max_f32_e32 v3, v5, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_max_f32_e32 v2, v6, v7 +; SI-NEXT: v_max_f32_e32 v3, v4, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v0, v0, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -581,50 +586,54 @@ define <8 x half> @test_fmax_legacy_ugt_v8f16(<8 x half> %a, <8 x half> %b) #0 { ; SI-LABEL: test_fmax_legacy_ugt_v8f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_max_legacy_f32_e32 v3, v7, v3 -; SI-NEXT: v_max_legacy_f32_e32 v2, v6, v2 -; SI-NEXT: v_max_legacy_f32_e32 v1, v5, v1 -; SI-NEXT: v_max_legacy_f32_e32 v0, v4, v0 -; SI-NEXT: v_max_legacy_f32_e32 v4, v15, v14 -; SI-NEXT: v_max_legacy_f32_e32 v5, v13, v12 -; SI-NEXT: v_max_legacy_f32_e32 v6, v11, v10 -; SI-NEXT: v_max_legacy_f32_e32 v7, v9, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v1 +; SI-NEXT: v_cmp_nle_f32_e32 vcc, v17, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v2 +; SI-NEXT: v_cmp_nle_f32_e64 s[4:5], v19, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v3 +; SI-NEXT: v_cmp_nle_f32_e64 s[6:7], v17, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v14 +; SI-NEXT: v_cmp_nle_f32_e64 s[8:9], v19, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v12 +; SI-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[8:9] +; SI-NEXT: v_cvt_f32_f16_e32 v7, v11 +; SI-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[6:7] +; SI-NEXT: v_cvt_f32_f16_e32 v6, v10 +; SI-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[4:5] +; SI-NEXT: v_cvt_f32_f16_e32 v5, v9 +; SI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_cmp_nle_f32_e32 vcc, v4, v5 +; SI-NEXT: v_cndmask_b32_e32 v4, v9, v8, vcc +; SI-NEXT: v_cmp_nle_f32_e32 vcc, v6, v7 +; SI-NEXT: v_cndmask_b32_e32 v5, v11, v10, vcc +; SI-NEXT: v_cmp_nle_f32_e32 vcc, v19, v18 +; SI-NEXT: v_cndmask_b32_e32 v6, v13, v12, vcc +; SI-NEXT: v_cmp_nle_f32_e32 vcc, v17, v16 +; SI-NEXT: v_cndmask_b32_e32 v7, v15, v14, vcc +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v0, v0, v7 ; SI-NEXT: v_or_b32_e32 v1, v1, v6 ; SI-NEXT: v_or_b32_e32 v2, v2, v5 @@ -721,50 +730,50 @@ define <8 x half> @test_fmax_legacy_ugt_v8f16_fast(<8 x half> %a, <8 x half> %b) ; SI-LABEL: test_fmax_legacy_ugt_v8f16_fast: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_max_f32_e32 v3, v3, v7 ; SI-NEXT: v_max_f32_e32 v2, v2, v6 ; SI-NEXT: v_max_f32_e32 v1, v1, v5 ; SI-NEXT: v_max_f32_e32 v0, v0, v4 -; SI-NEXT: v_max_f32_e32 v4, v15, v14 -; SI-NEXT: v_max_f32_e32 v5, v13, v12 -; SI-NEXT: v_max_f32_e32 v6, v11, v10 -; SI-NEXT: v_max_f32_e32 v7, v9, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_max_f32_e32 v4, v14, v15 +; SI-NEXT: v_max_f32_e32 v5, v12, v13 +; SI-NEXT: v_max_f32_e32 v6, v10, v11 +; SI-NEXT: v_max_f32_e32 v7, v8, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v0, v0, v7 ; SI-NEXT: v_or_b32_e32 v1, v1, v6 ; SI-NEXT: v_or_b32_e32 v2, v2, v5 diff --git a/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll b/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll index d8014962eb3bd..7a89b58f239a9 100644 --- a/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll @@ -17,25 +17,15 @@ declare float @llvm.fabs.f32(float) #0 declare half @llvm.fabs.f16(half) #0 define half @fmed3_f32_fpext_f16(half %arg0, half %arg1, half %arg2) #1 { -; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-GISEL-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: fmed3_f32_fpext_f16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_med3_f32 v0, v0, v1, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16: ; GFX8-SDAG: ; %bb.0: @@ -73,25 +63,15 @@ define half @fmed3_f32_fpext_f16(half %arg0, half %arg1, half %arg2) #1 { } define half @fmed3_f32_fpext_f16_flags(half %arg0, half %arg1, half %arg2) #1 { -; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_flags: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_flags: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-GISEL-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: fmed3_f32_fpext_f16_flags: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_med3_f32 v0, v0, v1, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_flags: ; GFX8-SDAG: ; %bb.0: @@ -129,29 +109,17 @@ define half @fmed3_f32_fpext_f16_flags(half %arg0, half %arg1, half %arg2) #1 { } define half @fmed3_f32_fpext_f16_multi_use(half %arg0, half %arg1, half %arg2, ptr addrspace(1) %ptr) #1 { -; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_multi_use: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_med3_f32 v1, v0, v1, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v1 -; GFX7-SDAG-NEXT: flat_store_dword v[3:4], v1 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_multi_use: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-GISEL-NEXT: v_med3_f32 v1, v0, v1, v2 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v1 -; GFX7-GISEL-NEXT: flat_store_dword v[3:4], v1 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: fmed3_f32_fpext_f16_multi_use: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_med3_f32 v1, v0, v1, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v1 +; GFX7-NEXT: flat_store_dword v[3:4], v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: fmed3_f32_fpext_f16_multi_use: ; GFX8: ; %bb.0: @@ -186,23 +154,14 @@ define half @fmed3_f32_fpext_f16_multi_use(half %arg0, half %arg1, half %arg2, p } define half @fmed3_f32_fpext_f16_k0(half %arg1, half %arg2) #1 { -; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_k0: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_med3_f32 v0, 2.0, v0, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_k0: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_med3_f32 v0, 2.0, v0, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: fmed3_f32_fpext_f16_k0: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_med3_f32 v0, 2.0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_k0: ; GFX8-SDAG: ; %bb.0: @@ -238,23 +197,14 @@ define half @fmed3_f32_fpext_f16_k0(half %arg1, half %arg2) #1 { } define half @fmed3_f32_fpext_f16_k1(half %arg0, half %arg2) #1 { -; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_k1: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_med3_f32 v0, v0, 2.0, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_k1: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_med3_f32 v0, v0, 2.0, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: fmed3_f32_fpext_f16_k1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_med3_f32 v0, v0, 2.0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_k1: ; GFX8-SDAG: ; %bb.0: @@ -290,23 +240,14 @@ define half @fmed3_f32_fpext_f16_k1(half %arg0, half %arg2) #1 { } define half @fmed3_f32_fpext_f16_k2(half %arg0, half %arg1) #1 { -; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_k2: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_med3_f32 v0, v0, v1, 2.0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_k2: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_med3_f32 v0, v0, v1, 2.0 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: fmed3_f32_fpext_f16_k2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_med3_f32 v0, v0, v1, 2.0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_k2: ; GFX8-SDAG: ; %bb.0: @@ -526,25 +467,15 @@ define half @fmed3_fabs_f32_fpext_f16(half %arg0, half %arg1, half %arg2) #1 { } define half @fmed3_f32_fpext_f16_fneg(half %arg0, half %arg1, half %arg2) #1 { -; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_fneg: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_med3_f32 v0, -v0, -v1, -v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_fneg: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v1, -v1 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v2, -v2 -; GFX7-GISEL-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: fmed3_f32_fpext_f16_fneg: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; GFX7-NEXT: v_cvt_f32_f16_e64 v1, -v1 +; GFX7-NEXT: v_cvt_f32_f16_e64 v2, -v2 +; GFX7-NEXT: v_med3_f32 v0, v0, v1, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_fneg: ; GFX8-SDAG: ; %bb.0: @@ -585,45 +516,15 @@ define half @fmed3_f32_fpext_f16_fneg(half %arg0, half %arg1, half %arg2) #1 { } define half @fmed3_fneg_f32_fpext_f16(half %arg0, half %arg1, half %arg2) #1 { -; GFX7-SDAG-LABEL: fmed3_fneg_f32_fpext_f16: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_med3_f32 v0, -v0, -v1, -v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmed3_fneg_f32_fpext_f16: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-GISEL-NEXT: v_med3_f32 v0, -v0, -v1, -v2 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: fmed3_fneg_f32_fpext_f16: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX8-NEXT: v_med3_f32 v0, -v0, -v1, -v2 -; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fmed3_fneg_f32_fpext_f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX9-NEXT: v_med3_f32 v0, -v0, -v1, -v2 -; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: fmed3_fneg_f32_fpext_f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GCN-NEXT: v_med3_f32 v0, -v0, -v1, -v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] %arg0.ext = fpext half %arg0 to float %arg1.ext = fpext half %arg1 to float %arg2.ext = fpext half %arg2 to float @@ -636,25 +537,15 @@ define half @fmed3_fneg_f32_fpext_f16(half %arg0, half %arg1, half %arg2) #1 { } define half @fmed3_f32_fpext_f16_fneg_fabs(half %arg0, half %arg1, half %arg2) #1 { -; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_fneg_fabs: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v2, |v2| -; GFX7-SDAG-NEXT: v_med3_f32 v0, -v0, -v1, -v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_fneg_fabs: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v0, -|v0| -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v1, -|v1| -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v2, -|v2| -; GFX7-GISEL-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: fmed3_f32_fpext_f16_fneg_fabs: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e64 v0, -|v0| +; GFX7-NEXT: v_cvt_f32_f16_e64 v1, -|v1| +; GFX7-NEXT: v_cvt_f32_f16_e64 v2, -|v2| +; GFX7-NEXT: v_med3_f32 v0, v0, v1, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_fneg_fabs: ; GFX8-SDAG: ; %bb.0: @@ -758,9 +649,9 @@ define bfloat @fmed3_f32_fpext_f16_fptrunc_bf16(half %arg0, half %arg1, half %ar ; GFX7-LABEL: fmed3_f32_fpext_f16_fptrunc_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_med3_f32 v0, v0, v1, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -805,29 +696,17 @@ define bfloat @fmed3_f32_fpext_f16_fptrunc_bf16(half %arg0, half %arg1, half %ar } define half @fmed3_f32_fpext_f16_multi_use_0(half %arg0, half %arg1, half %arg2, ptr addrspace(1) %ptr) #1 { -; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_multi_use_0: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX7-SDAG-NEXT: v_med3_f32 v0, v5, v1, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: flat_store_dword v[3:4], v5 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_multi_use_0: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v2 -; GFX7-GISEL-NEXT: flat_store_dword v[3:4], v5 -; GFX7-GISEL-NEXT: v_med3_f32 v0, v5, v0, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: fmed3_f32_fpext_f16_multi_use_0: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-NEXT: flat_store_dword v[3:4], v5 +; GFX7-NEXT: v_med3_f32 v0, v5, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_multi_use_0: ; GFX8-SDAG: ; %bb.0: @@ -874,29 +753,17 @@ define half @fmed3_f32_fpext_f16_multi_use_0(half %arg0, half %arg1, half %arg2, } define half @fmed3_f32_fpext_f16_multi_use_1(half %arg0, half %arg1, half %arg2, ptr addrspace(1) %ptr) #1 { -; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_multi_use_1: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: flat_store_dword v[3:4], v1 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_multi_use_1: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-GISEL-NEXT: flat_store_dword v[3:4], v1 -; GFX7-GISEL-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: fmed3_f32_fpext_f16_multi_use_1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: flat_store_dword v[3:4], v1 +; GFX7-NEXT: v_med3_f32 v0, v0, v1, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_multi_use_1: ; GFX8-SDAG: ; %bb.0: @@ -943,29 +810,17 @@ define half @fmed3_f32_fpext_f16_multi_use_1(half %arg0, half %arg1, half %arg2, } define half @fmed3_f32_fpext_f16_multi_use_2(half %arg0, half %arg1, half %arg2, ptr addrspace(1) %ptr) #1 { -; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_multi_use_2: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: flat_store_dword v[3:4], v2 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_multi_use_2: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-GISEL-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: flat_store_dword v[3:4], v2 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: fmed3_f32_fpext_f16_multi_use_2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_med3_f32 v0, v0, v1, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: flat_store_dword v[3:4], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_multi_use_2: ; GFX8-SDAG: ; %bb.0: @@ -1030,35 +885,15 @@ define half @fmed3_f32_fpext_bf16(bfloat %arg0, bfloat %arg1, bfloat %arg2) #1 { } define half @fmed3_f32_fpext_f16_bf16_0(bfloat %arg0, half %arg1, half %arg2) #1 { -; GFX7-LABEL: fmed3_f32_fpext_f16_bf16_0: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: fmed3_f32_fpext_f16_bf16_0: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fmed3_f32_fpext_f16_bf16_0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: fmed3_f32_fpext_f16_bf16_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_med3_f32 v0, v0, v1, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] %arg0.ext = fpext bfloat %arg0 to float %arg1.ext = fpext half %arg1 to float %arg2.ext = fpext half %arg2 to float @@ -1068,35 +903,15 @@ define half @fmed3_f32_fpext_f16_bf16_0(bfloat %arg0, half %arg1, half %arg2) #1 } define half @fmed3_f32_fpext_f16_bf16_1(half %arg0, bfloat %arg1, half %arg2) #1 { -; GFX7-LABEL: fmed3_f32_fpext_f16_bf16_1: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: fmed3_f32_fpext_f16_bf16_1: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fmed3_f32_fpext_f16_bf16_1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: fmed3_f32_fpext_f16_bf16_1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_med3_f32 v0, v0, v1, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] %arg0.ext = fpext half %arg0 to float %arg1.ext = fpext bfloat %arg1 to float %arg2.ext = fpext half %arg2 to float @@ -1106,35 +921,15 @@ define half @fmed3_f32_fpext_f16_bf16_1(half %arg0, bfloat %arg1, half %arg2) #1 } define half @fmed3_f32_fpext_f16_bf16_2(half %arg0, half %arg1, bfloat %arg2) #1 { -; GFX7-LABEL: fmed3_f32_fpext_f16_bf16_2: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: fmed3_f32_fpext_f16_bf16_2: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fmed3_f32_fpext_f16_bf16_2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: fmed3_f32_fpext_f16_bf16_2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_med3_f32 v0, v0, v1, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] %arg0.ext = fpext half %arg0 to float %arg1.ext = fpext half %arg1 to float %arg2.ext = fpext bfloat %arg2 to float @@ -1147,8 +942,8 @@ define half @fmed3_f32_fpext_f16_unrepresentable_k0(half %arg1, half %arg2) #1 { ; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_unrepresentable_k0: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: s_mov_b32 s4, 0x4f800000 ; GFX7-SDAG-NEXT: v_med3_f32 v0, s4, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1214,8 +1009,8 @@ define half @fmed3_f32_fpext_f16_unrepresentable_k1(half %arg0, half %arg2) #1 { ; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_unrepresentable_k1: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: s_mov_b32 s4, 0x4f800000 ; GFX7-SDAG-NEXT: v_med3_f32 v0, v0, s4, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1281,8 +1076,8 @@ define half @fmed3_f32_fpext_f16_unrepresentable_k2(half %arg0, half %arg1) #1 { ; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_unrepresentable_k2: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: s_mov_b32 s4, 0x4f800000 ; GFX7-SDAG-NEXT: v_med3_f32 v0, v0, v1, s4 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll index b37ab370d0bbf..668347eb97004 100644 --- a/llvm/test/CodeGen/AMDGPU/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll @@ -7449,7 +7449,12 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-SDAG-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SDAG-NEXT: v_max_f32_e32 v2, 2.0, v2 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SDAG-NEXT: v_min_f32_e32 v2, 4.0, v2 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-SDAG-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 ; SI-SDAG-NEXT: s_endpgm @@ -7624,7 +7629,22 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt ; SI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-SDAG-NEXT: v_add_f32_e32 v3, 2.0, v3 ; SI-SDAG-NEXT: v_add_f32_e32 v4, 4.0, v4 -; SI-SDAG-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-SDAG-NEXT: v_min_f32_e32 v5, v2, v3 +; SI-SDAG-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v5 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SDAG-NEXT: v_min_f32_e32 v2, v2, v4 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SDAG-NEXT: v_max_f32_e32 v2, v3, v2 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-SDAG-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 ; SI-SDAG-NEXT: s_endpgm @@ -8694,24 +8714,16 @@ define float @v_test_nnan_input_fmed3_r_i_i_f32_maximumnum_minnum(float %a) { } define half @v_test_fmed3_r_i_i_f16_minimumnum_maximumnum(half %a) { -; SI-SDAG-LABEL: v_test_fmed3_r_i_i_f16_minimumnum_maximumnum: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_test_fmed3_r_i_i_f16_minimumnum_maximumnum: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_max_f32_e32 v0, 2.0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_min_f32_e32 v0, 4.0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_test_fmed3_r_i_i_f16_minimumnum_maximumnum: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_max_f32_e32 v0, 2.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_min_f32_e32 v0, 4.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_test_fmed3_r_i_i_f16_minimumnum_maximumnum: ; VI-SDAG: ; %bb.0: @@ -8778,12 +8790,18 @@ define <2 x half> @v_test_fmed3_r_i_i_v2f16_minimumnum_maximumnum(<2 x half> %a) ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_max_f32_e32 v0, 2.0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 +; SI-SDAG-NEXT: v_max_f32_e32 v1, 2.0, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_min_f32_e32 v0, 4.0, v0 +; SI-SDAG-NEXT: v_min_f32_e32 v1, 4.0, v1 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -8994,11 +9012,30 @@ define <2 x half> @v_test_nnan_input_fmed3_r_i_i_v2f16_maximum_minimum(<2 x half ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-SDAG-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-SDAG-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 -; SI-SDAG-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_max_f32_e32 v3, 2.0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; SI-SDAG-NEXT: v_max_f32_e32 v3, 2.0, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_min_f32_e32 v3, 4.0, v0 +; SI-SDAG-NEXT: v_min_f32_e32 v4, 4.0, v1 +; SI-SDAG-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; SI-SDAG-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -9182,8 +9219,18 @@ define half @v_test_nnan_input_fmed3_r_i_i_f16_maximum_minimum(half %a) { ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; SI-SDAG-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-SDAG-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_max_f32_e32 v2, 2.0, v0 +; SI-SDAG-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_min_f32_e32 v2, 4.0, v0 +; SI-SDAG-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll b/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll index bb6b20df0c149..4f6369078c386 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll @@ -1039,20 +1039,26 @@ define half @v_min3_f16_minimumnum_minimumnum__v_v_v_0(half %a, half %b, half %c ; GFX6-LABEL: v_min3_f16_minimumnum_minimumnum__v_v_v_0: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_min3_f32 v0, v0, v1, v2 +; GFX6-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_min3_f16_minimumnum_minimumnum__v_v_v_0: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_min3_f32 v0, v0, v1, v2 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1118,40 +1124,52 @@ define <2 x half> @v_min3_v2f16_minimumnum_minimumnum__v_v_v_0(<2 x half> %a, <2 ; GFX6-LABEL: v_min3_v2f16_minimumnum_minimumnum__v_v_v_0: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_min3_f32 v3, v5, v4, v3 +; GFX6-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_min3_f32 v0, v0, v1, v2 +; GFX6-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_min_f32_e32 v1, v3, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_min3_v2f16_minimumnum_minimumnum__v_v_v_0: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_min3_f32 v3, v5, v4, v3 +; GFX7-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_min3_f32 v0, v0, v1, v2 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_min_f32_e32 v1, v3, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1237,23 +1255,32 @@ define <3 x half> @v_min3_v3f16_minimumnum_minimumnum__v_v_v_0(<3 x half> %a, <3 ; GFX6-LABEL: v_min3_v3f16_minimumnum_minimumnum__v_v_v_0: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_min3_f32 v6, v8, v7, v6 -; GFX6-NEXT: v_min3_f32 v0, v0, v2, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v6 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v7 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v8 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_min3_f32 v1, v1, v3, v5 +; GFX6-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_min_f32_e32 v2, v3, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_min_f32_e32 v1, v1, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 @@ -1262,23 +1289,32 @@ define <3 x half> @v_min3_v3f16_minimumnum_minimumnum__v_v_v_0(<3 x half> %a, <3 ; GFX7-LABEL: v_min3_v3f16_minimumnum_minimumnum__v_v_v_0: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_min3_f32 v6, v8, v7, v6 -; GFX7-NEXT: v_min3_f32 v0, v0, v2, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v6 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v8 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_min3_f32 v1, v1, v3, v5 +; GFX7-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_min_f32_e32 v2, v3, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v1, v1, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -1397,70 +1433,94 @@ define <4 x half> @v_min3_v4f16_minimumnum_minimumnum__v_v_v_0(<4 x half> %a, <4 ; GFX6-LABEL: v_min3_v4f16_minimumnum_minimumnum__v_v_v_0: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; GFX6-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v11, 16, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_min_f32_e32 v7, v7, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v8, v11 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_min3_f32 v6, v8, v7, v6 -; GFX6-NEXT: v_min3_f32 v9, v11, v10, v9 -; GFX6-NEXT: v_min3_f32 v0, v0, v2, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v6 -; GFX6-NEXT: v_min3_f32 v1, v1, v3, v5 +; GFX6-NEXT: v_min_f32_e32 v8, v10, v8 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX6-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v8, v9 +; GFX6-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX6-NEXT: v_min_f32_e32 v2, v3, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_min_f32_e32 v3, v3, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX6-NEXT: v_min_f32_e32 v0, v0, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v9 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_min3_v4f16_minimumnum_minimumnum__v_v_v_0: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v11 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_min3_f32 v6, v8, v7, v6 -; GFX7-NEXT: v_min3_f32 v9, v11, v10, v9 -; GFX7-NEXT: v_min3_f32 v0, v0, v2, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v6 -; GFX7-NEXT: v_min3_f32 v1, v1, v3, v5 +; GFX7-NEXT: v_min_f32_e32 v8, v10, v8 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v9 +; GFX7-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX7-NEXT: v_min_f32_e32 v2, v3, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_min_f32_e32 v3, v3, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v9 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll index fee2fad933158..6be2eb93ee25c 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll @@ -400,27 +400,30 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_mov_b32 s19, s11 -; SI-NEXT: s_mov_b32 s22, s10 -; SI-NEXT: s_mov_b32 s23, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: s_mov_b32 s16, s4 ; SI-NEXT: s_mov_b32 s17, s5 -; SI-NEXT: s_mov_b32 s20, s6 -; SI-NEXT: s_mov_b32 s21, s7 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v2, off, s[20:23], 0 glc -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: buffer_load_ushort v2, off, s[0:3], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_min3_f32 v0, v0, v1, v2 +; SI-NEXT: v_min_f32_e32 v0, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NEXT: v_min_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -690,27 +693,30 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_mov_b32 s19, s11 -; SI-NEXT: s_mov_b32 s22, s10 -; SI-NEXT: s_mov_b32 s23, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: s_mov_b32 s16, s4 ; SI-NEXT: s_mov_b32 s17, s5 -; SI-NEXT: s_mov_b32 s20, s6 -; SI-NEXT: s_mov_b32 s21, s7 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v2, off, s[20:23], 0 glc -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: buffer_load_ushort v2, off, s[0:3], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_min3_f32 v0, v2, v0, v1 +; SI-NEXT: v_min_f32_e32 v0, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NEXT: v_min_f32_e32 v0, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -977,20 +983,30 @@ define <2 x half> @no_fmin3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, < ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_min_f32_e32 v0, v0, v1 -; SI-NEXT: v_min_f32_e32 v1, v7, v6 -; SI-NEXT: v_min3_f32 v0, v2, v0, v3 -; SI-NEXT: v_min3_f32 v1, v5, v1, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_min_f32_e32 v5, v6, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_min_f32_e32 v0, v2, v0 +; SI-NEXT: v_min_f32_e32 v1, v1, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_min_f32_e32 v0, v0, v3 +; SI-NEXT: v_min_f32_e32 v1, v1, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll index dd77eb6f364a7..8c9dccceff192 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll @@ -27,10 +27,10 @@ define half @test_fmin_legacy_ule_f16(half %a, half %b) #0 { ; SI-LABEL: test_fmin_legacy_ule_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_min_legacy_f32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2 +; SI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: test_fmin_legacy_ule_f16: @@ -120,15 +120,16 @@ define <2 x half> @test_fmin_legacy_ule_v2f16(<2 x half> %a, <2 x half> %b) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_min_legacy_f32_e32 v0, v1, v0 -; SI-NEXT: v_min_legacy_f32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v2 +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v5, v4 +; SI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v7, v6 +; SI-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -177,14 +178,14 @@ define <2 x half> @test_fmin_legacy_ule_v2f16_fast(<2 x half> %a, <2 x half> %b) ; SI-LABEL: test_fmin_legacy_ule_v2f16_fast: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_min_f32_e32 v0, v0, v1 -; SI-NEXT: v_min_f32_e32 v1, v3, v2 +; SI-NEXT: v_min_f32_e32 v1, v2, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -236,19 +237,21 @@ define <3 x half> @test_fmin_legacy_ule_v3f16(<3 x half> %a, <3 x half> %b) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_min_legacy_f32_e32 v1, v3, v1 -; SI-NEXT: v_min_legacy_f32_e32 v0, v2, v0 -; SI-NEXT: v_min_legacy_f32_e32 v2, v5, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v9, v8 +; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v7, v6 +; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v11, v10 +; SI-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -305,17 +308,17 @@ define <3 x half> @test_fmin_legacy_ule_v3f16_fast(<3 x half> %a, <3 x half> %b) ; SI-LABEL: test_fmin_legacy_ule_v3f16_fast: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_min_f32_e32 v1, v1, v3 ; SI-NEXT: v_min_f32_e32 v0, v0, v2 -; SI-NEXT: v_min_f32_e32 v2, v5, v4 +; SI-NEXT: v_min_f32_e32 v2, v4, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -382,28 +385,30 @@ define <4 x half> @test_fmin_legacy_ule_v4f16(<4 x half> %a, <4 x half> %b) #0 { ; SI-LABEL: test_fmin_legacy_ule_v4f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_min_legacy_f32_e32 v1, v3, v1 -; SI-NEXT: v_min_legacy_f32_e32 v0, v2, v0 -; SI-NEXT: v_min_legacy_f32_e32 v2, v7, v6 -; SI-NEXT: v_min_legacy_f32_e32 v3, v5, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v4 +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v11, v10 +; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v9, v8 +; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v15, v14 +; SI-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v13, v12 +; SI-NEXT: v_cndmask_b32_e32 v3, v7, v6, vcc +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v0, v0, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -470,28 +475,28 @@ define <4 x half> @test_fmin_legacy_ule_v4f16_fast(<4 x half> %a, <4 x half> %b) ; SI-LABEL: test_fmin_legacy_ule_v4f16_fast: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_min_f32_e32 v1, v1, v3 ; SI-NEXT: v_min_f32_e32 v0, v0, v2 -; SI-NEXT: v_min_f32_e32 v2, v7, v6 -; SI-NEXT: v_min_f32_e32 v3, v5, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_min_f32_e32 v2, v6, v7 +; SI-NEXT: v_min_f32_e32 v3, v4, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v0, v0, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -582,50 +587,54 @@ define <8 x half> @test_fmin_legacy_ule_v8f16(<8 x half> %a, <8 x half> %b) #0 { ; SI-LABEL: test_fmin_legacy_ule_v8f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_min_legacy_f32_e32 v3, v7, v3 -; SI-NEXT: v_min_legacy_f32_e32 v2, v6, v2 -; SI-NEXT: v_min_legacy_f32_e32 v1, v5, v1 -; SI-NEXT: v_min_legacy_f32_e32 v0, v4, v0 -; SI-NEXT: v_min_legacy_f32_e32 v4, v15, v14 -; SI-NEXT: v_min_legacy_f32_e32 v5, v13, v12 -; SI-NEXT: v_min_legacy_f32_e32 v6, v11, v10 -; SI-NEXT: v_min_legacy_f32_e32 v7, v9, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v1 +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v17, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v2 +; SI-NEXT: v_cmp_ngt_f32_e64 s[4:5], v19, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v3 +; SI-NEXT: v_cmp_ngt_f32_e64 s[6:7], v17, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v14 +; SI-NEXT: v_cmp_ngt_f32_e64 s[8:9], v19, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v12 +; SI-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[8:9] +; SI-NEXT: v_cvt_f32_f16_e32 v7, v11 +; SI-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[6:7] +; SI-NEXT: v_cvt_f32_f16_e32 v6, v10 +; SI-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[4:5] +; SI-NEXT: v_cvt_f32_f16_e32 v5, v9 +; SI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v5 +; SI-NEXT: v_cndmask_b32_e32 v4, v9, v8, vcc +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v6, v7 +; SI-NEXT: v_cndmask_b32_e32 v5, v11, v10, vcc +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v19, v18 +; SI-NEXT: v_cndmask_b32_e32 v6, v13, v12, vcc +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v17, v16 +; SI-NEXT: v_cndmask_b32_e32 v7, v15, v14, vcc +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v0, v0, v7 ; SI-NEXT: v_or_b32_e32 v1, v1, v6 ; SI-NEXT: v_or_b32_e32 v2, v2, v5 @@ -722,50 +731,50 @@ define <8 x half> @test_fmin_legacy_ule_v8f16_fast(<8 x half> %a, <8 x half> %b) ; SI-LABEL: test_fmin_legacy_ule_v8f16_fast: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_min_f32_e32 v3, v3, v7 ; SI-NEXT: v_min_f32_e32 v2, v2, v6 ; SI-NEXT: v_min_f32_e32 v1, v1, v5 ; SI-NEXT: v_min_f32_e32 v0, v0, v4 -; SI-NEXT: v_min_f32_e32 v4, v15, v14 -; SI-NEXT: v_min_f32_e32 v5, v13, v12 -; SI-NEXT: v_min_f32_e32 v6, v11, v10 -; SI-NEXT: v_min_f32_e32 v7, v9, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_min_f32_e32 v4, v14, v15 +; SI-NEXT: v_min_f32_e32 v5, v12, v13 +; SI-NEXT: v_min_f32_e32 v6, v10, v11 +; SI-NEXT: v_min_f32_e32 v7, v8, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v0, v0, v7 ; SI-NEXT: v_or_b32_e32 v1, v1, v6 ; SI-NEXT: v_or_b32_e32 v2, v2, v5 diff --git a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll index f8719936b2d0a..082006898b436 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll @@ -598,31 +598,31 @@ define amdgpu_kernel void @fmul_v4f16( ; SI-NEXT: s_mov_b32 s0, s8 ; SI-NEXT: s_mov_b32 s1, s9 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_mul_f32_e32 v5, v7, v5 -; SI-NEXT: v_mul_f32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_mul_f32_e32 v1, v3, v1 ; SI-NEXT: v_mul_f32_e32 v0, v2, v0 +; SI-NEXT: v_mul_f32_e32 v2, v7, v5 +; SI-NEXT: v_mul_f32_e32 v3, v6, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v4 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v3, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -713,37 +713,37 @@ entry: define amdgpu_kernel void @fmul_v4f16_imm_a( ; SI-LABEL: fmul_v4f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s2 -; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v3, 0x40400000, v3 -; SI-NEXT: v_mul_f32_e32 v2, 0x41000000, v2 -; SI-NEXT: v_mul_f32_e32 v1, 4.0, v1 -; SI-NEXT: v_add_f32_e32 v0, v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v3, 4.0, v3 +; SI-NEXT: v_add_f32_e32 v2, v2, v2 +; SI-NEXT: v_mul_f32_e32 v1, 0x40400000, v1 +; SI-NEXT: v_mul_f32_e32 v0, 0x41000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fmul_v4f16_imm_a: diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll index 2079ee54653ce..16ec854a12c53 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll @@ -22,13 +22,13 @@ define half @v_fneg_add_f16(half %a, half %b) #0 { ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SAFE-NEXT: v_add_f32_e32 v0, v0, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_add_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NSZ-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -84,9 +84,9 @@ define { half, half } @v_fneg_add_store_use_add_f16(half %a, half %b) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v1, v0, v1 -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0xffff8000, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_add_store_use_add_f16: @@ -131,20 +131,22 @@ define { half, half } @v_fneg_add_multi_use_add_f16(half %a, half %b) #0 { ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SAFE-NEXT: v_add_f32_e32 v0, v0, v1 -; SI-SAFE-NEXT: v_mul_f32_e32 v1, 4.0, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 +; SI-SAFE-NEXT: v_mul_f32_e32 v1, 4.0, v1 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_add_multi_use_add_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NSZ-NEXT: v_sub_f32_e32 v0, v0, v1 -; SI-NSZ-NEXT: v_mul_f32_e32 v1, -4.0, v0 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v1, -v0 +; SI-NSZ-NEXT: v_mul_f32_e32 v1, 4.0, v1 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; @@ -210,7 +212,8 @@ define half @v_fneg_add_fneg_x_f16(half %a, half %b) #0 { ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SAFE-NEXT: v_sub_f32_e32 v0, v1, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_add_fneg_x_f16: @@ -273,7 +276,8 @@ define half @v_fneg_add_x_fneg_f16(half %a, half %b) #0 { ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SAFE-NEXT: v_sub_f32_e32 v0, v0, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_add_x_fneg_f16: @@ -333,11 +337,11 @@ define half @v_fneg_add_fneg_fneg_f16(half %a, half %b) #0 { ; SI-SAFE-LABEL: v_fneg_add_fneg_fneg_f16: ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SAFE-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-SAFE-NEXT: v_sub_f32_e32 v0, v0, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_add_fneg_fneg_f16: @@ -400,11 +404,10 @@ define { half, half } @v_fneg_add_store_use_fneg_x_f16(half %a, half %b) #0 { ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v0 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-SAFE-NEXT: v_sub_f32_e32 v1, v1, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v2, -v1 -; SI-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v0 -; SI-SAFE-NEXT: v_mov_b32_e32 v0, v2 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v1 +; SI-SAFE-NEXT: v_xor_b32_e32 v1, 0xffff8000, v0 +; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0xffff8000, v2 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_add_store_use_fneg_x_f16: @@ -412,10 +415,9 @@ define { half, half } @v_fneg_add_store_use_fneg_x_f16(half %a, half %b) #0 { ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NSZ-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NSZ-NEXT: v_sub_f32_e32 v1, v2, v1 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NSZ-NEXT: v_xor_b32_e32 v1, 0x8000, v0 +; SI-NSZ-NEXT: v_xor_b32_e32 v1, 0xffff8000, v0 ; SI-NSZ-NEXT: v_mov_b32_e32 v0, v2 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; @@ -483,28 +485,27 @@ define { half, half } @v_fneg_add_multi_use_fneg_x_f16(half %a, half %b, half %c ; SI-SAFE-LABEL: v_fneg_add_multi_use_fneg_x_f16: ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_and_b32_e32 v3, 0xffff, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v0 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e64 v3, -v3 -; SI-SAFE-NEXT: v_sub_f32_e32 v0, v1, v0 -; SI-SAFE-NEXT: v_mul_f32_e32 v1, v3, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SAFE-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-SAFE-NEXT: v_sub_f32_e32 v1, v1, v3 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v1 +; SI-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v0 +; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0xffff8000, v3 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_add_multi_use_fneg_x_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_and_b32_e32 v3, 0xffff, v0 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v3, -v3 -; SI-NSZ-NEXT: v_sub_f32_e32 v0, v0, v1 -; SI-NSZ-NEXT: v_mul_f32_e32 v1, v3, v2 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v4, -v0 +; SI-NSZ-NEXT: v_sub_f32_e32 v0, v3, v1 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NSZ-NEXT: v_mul_f32_e32 v1, v4, v2 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; @@ -574,28 +575,38 @@ define amdgpu_ps half @fneg_fadd_0_safe_f16(half inreg %tmp2, half inreg %tmp6, ; SI-LABEL: fneg_fadd_0_safe_f16: ; SI: ; %bb.0: ; %.entry ; SI-NEXT: v_cvt_f32_f16_e32 v0, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; SI-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, 1.0 -; SI-NEXT: v_rcp_f32_e32 v3, v2 -; SI-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 +; SI-NEXT: v_div_scale_f32 v1, s[2:3], v0, v0, 1.0 +; SI-NEXT: v_rcp_f32_e32 v2, v1 +; SI-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; SI-NEXT: v_fma_f32 v3, v5, v3, v3 -; SI-NEXT: v_mul_f32_e32 v5, v4, v3 -; SI-NEXT: v_fma_f32 v6, -v2, v5, v4 -; SI-NEXT: v_fma_f32 v5, v6, v3, v5 -; SI-NEXT: v_fma_f32 v2, -v2, v5, v4 +; SI-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; SI-NEXT: v_fma_f32 v2, v4, v2, v2 +; SI-NEXT: v_mul_f32_e32 v4, v3, v2 +; SI-NEXT: v_fma_f32 v5, -v1, v4, v3 +; SI-NEXT: v_fma_f32 v4, v5, v2, v4 +; SI-NEXT: v_fma_f32 v1, -v1, v4, v3 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; SI-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 -; SI-NEXT: v_mad_f32 v0, v0, 0, 0 -; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1 -; SI-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc -; SI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; SI-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 -; SI-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc +; SI-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; SI-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 +; SI-NEXT: v_mov_b32_e32 v3, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v2, v1 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-NEXT: s_cselect_b32 s0, 0, 0x7e00 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: fneg_fadd_0_safe_f16: @@ -644,14 +655,20 @@ define amdgpu_ps half @fneg_fadd_0_nsz_f16(half inreg %tmp2, half inreg %tmp6, < ; SI: ; %bb.0: ; %.entry ; SI-NEXT: v_cvt_f32_f16_e32 v0, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; SI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; SI-NEXT: v_mov_b32_e32 v3, s0 ; SI-NEXT: v_rcp_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_mul_f32_e32 v0, 0x80000000, v0 -; SI-NEXT: v_cmp_nlt_f32_e64 vcc, -v0, v1 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 -; SI-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v2, -v0 +; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v2, v1 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-NEXT: s_cselect_b32 s0, 0, 0x7e00 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: fneg_fadd_0_nsz_f16: @@ -699,9 +716,8 @@ define half @v_fneg_mul_f16(half %a, half %b) #0 { ; SI-LABEL: v_fneg_mul_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -738,9 +754,9 @@ define { half, half } @v_fneg_mul_store_use_mul_f16(half %a, half %b) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v1, v0, v1 -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0xffff8000, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_mul_store_use_mul_f16: @@ -782,12 +798,12 @@ define { half, half } @v_fneg_mul_multi_use_mul_f16(half %a, half %b) #0 { ; SI-LABEL: v_fneg_mul_multi_use_mul_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NEXT: v_mul_f32_e32 v1, -4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v1, -v0 +; SI-NEXT: v_mul_f32_e32 v1, 4.0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -905,9 +921,8 @@ define half @v_fneg_mul_fneg_fneg_f16(half %a, half %b) #0 { ; SI-LABEL: v_fneg_mul_fneg_fneg_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -946,10 +961,9 @@ define { half, half } @v_fneg_mul_store_use_fneg_x_f16(half %a, half %b) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_mul_f32_e32 v1, v2, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NEXT: v_xor_b32_e32 v1, 0x8000, v0 +; SI-NEXT: v_xor_b32_e32 v1, 0xffff8000, v0 ; SI-NEXT: v_mov_b32_e32 v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -997,14 +1011,13 @@ define { half, half } @v_fneg_mul_multi_use_fneg_x_f16(half %a, half %b, half %c ; SI-LABEL: v_fneg_mul_multi_use_fneg_x_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e64 v3, -v3 -; SI-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NEXT: v_mul_f32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e64 v4, -v0 +; SI-NEXT: v_mul_f32_e32 v0, v3, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v1, v4, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1057,8 +1070,6 @@ define half @v_fneg_minnum_f16_ieee(half %a, half %b) #0 { ; SI-LABEL: v_fneg_minnum_f16_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, v0, v1 @@ -1106,8 +1117,6 @@ define half @v_fneg_minnum_f16_no_ieee(half %a, half %b) #4 { ; SI-LABEL: v_fneg_minnum_f16_no_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, v0, v1 @@ -1144,8 +1153,7 @@ define half @v_fneg_self_minnum_f16_ieee(half %a) #0 { ; SI-LABEL: v_fneg_self_minnum_f16_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_self_minnum_f16_ieee: @@ -1178,8 +1186,7 @@ define half @v_fneg_self_minnum_f16_no_ieee(half %a) #4 { ; SI-LABEL: v_fneg_self_minnum_f16_no_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_self_minnum_f16_no_ieee: @@ -1212,7 +1219,6 @@ define half @v_fneg_posk_minnum_f16_ieee(half %a) #0 { ; SI-LABEL: v_fneg_posk_minnum_f16_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, -4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1255,7 +1261,6 @@ define half @v_fneg_posk_minnum_f16_no_ieee(half %a) #4 { ; SI-LABEL: v_fneg_posk_minnum_f16_no_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, -4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1291,7 +1296,6 @@ define half @v_fneg_negk_minnum_f16_ieee(half %a) #0 { ; SI-LABEL: v_fneg_negk_minnum_f16_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, 4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1334,7 +1338,6 @@ define half @v_fneg_negk_minnum_f16_no_ieee(half %a) #4 { ; SI-LABEL: v_fneg_negk_minnum_f16_no_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, 4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1372,7 +1375,8 @@ define half @v_fneg_0_minnum_f16(half %a) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_min_f32_e32 v0, 0, v0 -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_0_minnum_f16: @@ -1412,7 +1416,6 @@ define half @v_fneg_neg0_minnum_f16_ieee(half %a) #0 { ; SI-LABEL: v_fneg_neg0_minnum_f16_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, 0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1455,7 +1458,6 @@ define half @v_fneg_inv2pi_minnum_f16(half %a) #0 { ; SI-LABEL: v_fneg_inv2pi_minnum_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1502,7 +1504,6 @@ define half @v_fneg_neg_inv2pi_minnum_f16(half %a) #0 { ; SI-LABEL: v_fneg_neg_inv2pi_minnum_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1549,7 +1550,6 @@ define half @v_fneg_neg0_minnum_f16_no_ieee(half %a) #4 { ; SI-LABEL: v_fneg_neg0_minnum_f16_no_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, 0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1588,7 +1588,9 @@ define half @v_fneg_0_minnum_foldable_use_f16_ieee(half %a, half %b) #0 { ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_min_f32_e32 v0, 0, v0 -; SI-NEXT: v_mul_f32_e64 v0, -v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1634,10 +1636,11 @@ define half @v_fneg_inv2pi_minnum_foldable_use_f16(half %a, half %b) #0 { ; SI-LABEL: v_fneg_inv2pi_minnum_foldable_use_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1687,7 +1690,9 @@ define half @v_fneg_0_minnum_foldable_use_f16_no_ieee(half %a, half %b) #4 { ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_min_f32_e32 v0, 0, v0 -; SI-NEXT: v_mul_f32_e64 v0, -v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1729,13 +1734,12 @@ define { half, half } @v_fneg_minnum_multi_use_minnum_f16_ieee(half %a, half %b) ; SI-LABEL: v_fneg_minnum_multi_use_minnum_f16_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, v0, v1 -; SI-NEXT: v_mul_f32_e32 v1, -4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v1, -v0 +; SI-NEXT: v_mul_f32_e32 v1, 4.0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1784,18 +1788,34 @@ define { half, half } @v_fneg_minnum_multi_use_minnum_f16_ieee(half %a, half %b) } define <2 x half> @v_fneg_minnum_multi_use_minnum_f16_no_ieee(half %a, half %b) #4 { -; SI-LABEL: v_fneg_minnum_multi_use_minnum_f16_no_ieee: -; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_min_f32_e32 v0, v0, v1 -; SI-NEXT: v_mul_f32_e32 v1, 4.0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: s_setpc_b64 s[30:31] +; SI-SAFE-LABEL: v_fneg_minnum_multi_use_minnum_f16_no_ieee: +; SI-SAFE: ; %bb.0: +; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SAFE-NEXT: v_min_f32_e32 v0, v0, v1 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; SI-SAFE-NEXT: v_mul_f32_e32 v1, 4.0, v1 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; SI-NSZ-LABEL: v_fneg_minnum_multi_use_minnum_f16_no_ieee: +; SI-NSZ: ; %bb.0: +; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NSZ-NEXT: v_min_f32_e32 v0, v0, v1 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v1, v0 mul:4 +; SI-NSZ-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_minnum_multi_use_minnum_f16_no_ieee: ; VI: ; %bb.0: @@ -1846,8 +1866,6 @@ define half @v_fneg_maxnum_f16_ieee(half %a, half %b) #0 { ; SI-LABEL: v_fneg_maxnum_f16_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_min_f32_e32 v0, v0, v1 @@ -1895,8 +1913,6 @@ define half @v_fneg_maxnum_f16_no_ieee(half %a, half %b) #4 { ; SI-LABEL: v_fneg_maxnum_f16_no_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_min_f32_e32 v0, v0, v1 @@ -1933,8 +1949,7 @@ define half @v_fneg_self_maxnum_f16_ieee(half %a) #0 { ; SI-LABEL: v_fneg_self_maxnum_f16_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_self_maxnum_f16_ieee: @@ -1967,8 +1982,7 @@ define half @v_fneg_self_maxnum_f16_no_ieee(half %a) #4 { ; SI-LABEL: v_fneg_self_maxnum_f16_no_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_self_maxnum_f16_no_ieee: @@ -2001,7 +2015,6 @@ define half @v_fneg_posk_maxnum_f16_ieee(half %a) #0 { ; SI-LABEL: v_fneg_posk_maxnum_f16_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_min_f32_e32 v0, -4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -2044,7 +2057,6 @@ define half @v_fneg_posk_maxnum_f16_no_ieee(half %a) #4 { ; SI-LABEL: v_fneg_posk_maxnum_f16_no_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_min_f32_e32 v0, -4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -2080,7 +2092,6 @@ define half @v_fneg_negk_maxnum_f16_ieee(half %a) #0 { ; SI-LABEL: v_fneg_negk_maxnum_f16_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_min_f32_e32 v0, 4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -2123,7 +2134,6 @@ define half @v_fneg_negk_maxnum_f16_no_ieee(half %a) #4 { ; SI-LABEL: v_fneg_negk_maxnum_f16_no_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_min_f32_e32 v0, 4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -2161,7 +2171,8 @@ define half @v_fneg_0_maxnum_f16(half %a) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_max_f32_e32 v0, 0, v0 -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_0_maxnum_f16: @@ -2201,7 +2212,6 @@ define half @v_fneg_neg0_maxnum_f16_ieee(half %a) #0 { ; SI-LABEL: v_fneg_neg0_maxnum_f16_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_min_f32_e32 v0, 0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -2244,7 +2254,6 @@ define half @v_fneg_neg0_maxnum_f16_no_ieee(half %a) #4 { ; SI-LABEL: v_fneg_neg0_maxnum_f16_no_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_min_f32_e32 v0, 0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -2283,7 +2292,9 @@ define half @v_fneg_0_maxnum_foldable_use_f16_ieee(half %a, half %b) #0 { ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_max_f32_e32 v0, 0, v0 -; SI-NEXT: v_mul_f32_e64 v0, -v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -2332,7 +2343,9 @@ define half @v_fneg_0_maxnum_foldable_use_f16_no_ieee(half %a, half %b) #4 { ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_max_f32_e32 v0, 0, v0 -; SI-NEXT: v_mul_f32_e64 v0, -v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -2374,13 +2387,12 @@ define { half, half } @v_fneg_maxnum_multi_use_maxnum_f16_ieee(half %a, half %b) ; SI-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_min_f32_e32 v0, v0, v1 -; SI-NEXT: v_mul_f32_e32 v1, -4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v1, -v0 +; SI-NEXT: v_mul_f32_e32 v1, 4.0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -2429,18 +2441,34 @@ define { half, half } @v_fneg_maxnum_multi_use_maxnum_f16_ieee(half %a, half %b) } define <2 x half> @v_fneg_maxnum_multi_use_maxnum_f16_no_ieee(half %a, half %b) #4 { -; SI-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_no_ieee: -; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_max_f32_e32 v0, v0, v1 -; SI-NEXT: v_mul_f32_e32 v1, 4.0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: s_setpc_b64 s[30:31] +; SI-SAFE-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_no_ieee: +; SI-SAFE: ; %bb.0: +; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SAFE-NEXT: v_max_f32_e32 v0, v0, v1 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; SI-SAFE-NEXT: v_mul_f32_e32 v1, 4.0, v1 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; SI-NSZ-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_no_ieee: +; SI-NSZ: ; %bb.0: +; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NSZ-NEXT: v_max_f32_e32 v0, v0, v1 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v1, v0 mul:4 +; SI-NSZ-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_no_ieee: ; VI: ; %bb.0: @@ -2492,20 +2520,111 @@ define half @v_fneg_fma_f16(half %a, half %b, half %c) #0 { ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_fma_f32 v0, v0, v1, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-SAFE-NEXT: s_movk_i32 s4, 0x3f1 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-SAFE-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-SAFE-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v2, 0xffe, v2 +; SI-SAFE-NEXT: v_bfe_u32 v3, v1, 20, 11 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-SAFE-NEXT: v_sub_i32_e32 v4, vcc, s4, v3 +; SI-SAFE-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-SAFE-NEXT: v_med3_i32 v4, v4, 0, 13 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v5, v4, v2 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 +; SI-SAFE-NEXT: s_movk_i32 s4, 0xfc10 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SAFE-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 12, v3 +; SI-SAFE-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-SAFE-NEXT: v_or_b32_e32 v4, v0, v4 +; SI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v4, 7, v2 +; SI-SAFE-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-SAFE-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-SAFE-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; SI-SAFE-NEXT: v_mov_b32_e32 v4, 0x7c00 +; SI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-SAFE-NEXT: v_mov_b32_e32 v5, 0x7e00 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SAFE-NEXT: s_movk_i32 s4, 0x40f +; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_fma_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NSZ-NEXT: v_fma_f32 v0, v0, -v1, -v2 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v2, -v2 +; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v3, -v1 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NSZ-NEXT: s_movk_i32 s4, 0x3f1 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-NSZ-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-NSZ-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v2, 0xffe, v2 +; SI-NSZ-NEXT: v_bfe_u32 v3, v1, 20, 11 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NSZ-NEXT: v_sub_i32_e32 v4, vcc, s4, v3 +; SI-NSZ-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-NSZ-NEXT: v_med3_i32 v4, v4, 0, 13 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v5, v4, v2 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 +; SI-NSZ-NEXT: s_movk_i32 s4, 0xfc10 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-NSZ-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v4, 12, v3 +; SI-NSZ-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NSZ-NEXT: v_or_b32_e32 v4, v0, v4 +; SI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v4, 7, v2 +; SI-NSZ-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-NSZ-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-NSZ-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; SI-NSZ-NEXT: v_mov_b32_e32 v4, 0x7c00 +; SI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NSZ-NEXT: v_mov_b32_e32 v5, 0x7e00 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NSZ-NEXT: s_movk_i32 s4, 0x40f +; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; SI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: v_fneg_fma_f16: @@ -2556,11 +2675,56 @@ define { half, half } @v_fneg_fma_store_use_fma_f16(half %a, half %b, half %c) # ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_fma_f32 v1, v0, v1, v2 -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: s_movk_i32 s4, 0x3f1 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_and_b32_e32 v2, 0xffe, v2 +; SI-NEXT: v_bfe_u32 v3, v1, 20, 11 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_sub_i32_e32 v4, vcc, s4, v3 +; SI-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-NEXT: v_med3_i32 v4, v4, 0, 13 +; SI-NEXT: v_lshrrev_b32_e32 v5, v4, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 +; SI-NEXT: s_movk_i32 s4, 0xfc10 +; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 12, v3 +; SI-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NEXT: v_or_b32_e32 v4, v0, v4 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NEXT: v_and_b32_e32 v4, 7, v2 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 +; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 +; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; SI-NEXT: v_mov_b32_e32 v4, 0x7c00 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NEXT: v_mov_b32_e32 v5, 0x7e00 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_movk_i32 s4, 0x40f +; SI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0xffff8000, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_fma_store_use_fma_f16: @@ -2603,23 +2767,116 @@ define { half, half } @v_fneg_fma_multi_use_fma_f16(half %a, half %b, half %c) # ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_fma_f32 v0, v0, v1, v2 -; SI-SAFE-NEXT: v_mul_f32_e32 v1, 4.0, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-SAFE-NEXT: s_movk_i32 s4, 0x3f1 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-SAFE-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-SAFE-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v2, 0xffe, v2 +; SI-SAFE-NEXT: v_bfe_u32 v3, v1, 20, 11 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-SAFE-NEXT: v_sub_i32_e32 v4, vcc, s4, v3 +; SI-SAFE-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-SAFE-NEXT: v_med3_i32 v4, v4, 0, 13 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v5, v4, v2 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 +; SI-SAFE-NEXT: s_movk_i32 s4, 0xfc10 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SAFE-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 12, v3 +; SI-SAFE-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-SAFE-NEXT: v_or_b32_e32 v4, v0, v4 +; SI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v4, 7, v2 +; SI-SAFE-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-SAFE-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-SAFE-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; SI-SAFE-NEXT: v_mov_b32_e32 v4, 0x7c00 +; SI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-SAFE-NEXT: v_mov_b32_e32 v5, 0x7e00 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SAFE-NEXT: s_movk_i32 s4, 0x40f +; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 +; SI-SAFE-NEXT: v_mul_f32_e32 v1, 4.0, v1 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_fma_multi_use_fma_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NSZ-NEXT: v_fma_f32 v0, v0, -v1, -v2 -; SI-NSZ-NEXT: v_mul_f32_e32 v1, -4.0, v0 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v2, -v2 +; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v3, -v1 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NSZ-NEXT: s_movk_i32 s4, 0x3f1 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-NSZ-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-NSZ-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v2, 0xffe, v2 +; SI-NSZ-NEXT: v_bfe_u32 v3, v1, 20, 11 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NSZ-NEXT: v_sub_i32_e32 v4, vcc, s4, v3 +; SI-NSZ-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-NSZ-NEXT: v_med3_i32 v4, v4, 0, 13 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v5, v4, v2 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 +; SI-NSZ-NEXT: s_movk_i32 s4, 0xfc10 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-NSZ-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v4, 12, v3 +; SI-NSZ-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NSZ-NEXT: v_or_b32_e32 v4, v0, v4 +; SI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v4, 7, v2 +; SI-NSZ-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-NSZ-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-NSZ-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; SI-NSZ-NEXT: v_mov_b32_e32 v4, 0x7c00 +; SI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NSZ-NEXT: v_mov_b32_e32 v5, 0x7e00 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NSZ-NEXT: s_movk_i32 s4, 0x40f +; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; SI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v1, -v0 +; SI-NSZ-NEXT: v_mul_f32_e32 v1, 4.0, v1 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; @@ -2682,20 +2939,111 @@ define half @v_fneg_fma_fneg_x_y_f16(half %a, half %b, half %c) #0 { ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_fma_f32 v0, -v0, v1, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-SAFE-NEXT: v_cvt_f32_f16_e64 v4, -v0 +; SI-SAFE-NEXT: s_movk_i32 s4, 0x3f1 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-SAFE-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-SAFE-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v2, 0xffe, v2 +; SI-SAFE-NEXT: v_bfe_u32 v3, v1, 20, 11 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-SAFE-NEXT: v_sub_i32_e32 v4, vcc, s4, v3 +; SI-SAFE-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-SAFE-NEXT: v_med3_i32 v4, v4, 0, 13 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v5, v4, v2 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 +; SI-SAFE-NEXT: s_movk_i32 s4, 0xfc10 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SAFE-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 12, v3 +; SI-SAFE-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-SAFE-NEXT: v_or_b32_e32 v4, v0, v4 +; SI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v4, 7, v2 +; SI-SAFE-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-SAFE-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-SAFE-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; SI-SAFE-NEXT: v_mov_b32_e32 v4, 0x7c00 +; SI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-SAFE-NEXT: v_mov_b32_e32 v5, 0x7e00 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SAFE-NEXT: s_movk_i32 s4, 0x40f +; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_fma_fneg_x_y_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NSZ-NEXT: v_fma_f32 v0, v0, v1, -v2 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v2, -v2 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NSZ-NEXT: s_movk_i32 s4, 0x3f1 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-NSZ-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-NSZ-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v2, 0xffe, v2 +; SI-NSZ-NEXT: v_bfe_u32 v3, v1, 20, 11 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NSZ-NEXT: v_sub_i32_e32 v4, vcc, s4, v3 +; SI-NSZ-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-NSZ-NEXT: v_med3_i32 v4, v4, 0, 13 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v5, v4, v2 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 +; SI-NSZ-NEXT: s_movk_i32 s4, 0xfc10 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-NSZ-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v4, 12, v3 +; SI-NSZ-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NSZ-NEXT: v_or_b32_e32 v4, v0, v4 +; SI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v4, 7, v2 +; SI-NSZ-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-NSZ-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-NSZ-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; SI-NSZ-NEXT: v_mov_b32_e32 v4, 0x7c00 +; SI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NSZ-NEXT: v_mov_b32_e32 v5, 0x7e00 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NSZ-NEXT: s_movk_i32 s4, 0x40f +; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; SI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: v_fneg_fma_fneg_x_y_f16: @@ -2747,20 +3095,111 @@ define half @v_fneg_fma_x_fneg_y_f16(half %a, half %b, half %c) #0 { ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_fma_f32 v0, v0, -v1, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e64 v3, -v1 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-SAFE-NEXT: s_movk_i32 s4, 0x3f1 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-SAFE-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-SAFE-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v2, 0xffe, v2 +; SI-SAFE-NEXT: v_bfe_u32 v3, v1, 20, 11 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-SAFE-NEXT: v_sub_i32_e32 v4, vcc, s4, v3 +; SI-SAFE-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-SAFE-NEXT: v_med3_i32 v4, v4, 0, 13 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v5, v4, v2 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 +; SI-SAFE-NEXT: s_movk_i32 s4, 0xfc10 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SAFE-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 12, v3 +; SI-SAFE-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-SAFE-NEXT: v_or_b32_e32 v4, v0, v4 +; SI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v4, 7, v2 +; SI-SAFE-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-SAFE-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-SAFE-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; SI-SAFE-NEXT: v_mov_b32_e32 v4, 0x7c00 +; SI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-SAFE-NEXT: v_mov_b32_e32 v5, 0x7e00 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SAFE-NEXT: s_movk_i32 s4, 0x40f +; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_fma_x_fneg_y_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NSZ-NEXT: v_fma_f32 v0, v0, v1, -v2 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v2, -v2 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NSZ-NEXT: s_movk_i32 s4, 0x3f1 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-NSZ-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-NSZ-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v2, 0xffe, v2 +; SI-NSZ-NEXT: v_bfe_u32 v3, v1, 20, 11 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NSZ-NEXT: v_sub_i32_e32 v4, vcc, s4, v3 +; SI-NSZ-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-NSZ-NEXT: v_med3_i32 v4, v4, 0, 13 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v5, v4, v2 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 +; SI-NSZ-NEXT: s_movk_i32 s4, 0xfc10 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-NSZ-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v4, 12, v3 +; SI-NSZ-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NSZ-NEXT: v_or_b32_e32 v4, v0, v4 +; SI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v4, 7, v2 +; SI-NSZ-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-NSZ-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-NSZ-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; SI-NSZ-NEXT: v_mov_b32_e32 v4, 0x7c00 +; SI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NSZ-NEXT: v_mov_b32_e32 v5, 0x7e00 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NSZ-NEXT: s_movk_i32 s4, 0x40f +; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; SI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: v_fneg_fma_x_fneg_y_f16: @@ -2812,20 +3251,111 @@ define half @v_fneg_fma_fneg_fneg_y_f16(half %a, half %b, half %c) #0 { ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_fma_f32 v0, v0, v1, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-SAFE-NEXT: s_movk_i32 s4, 0x3f1 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-SAFE-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-SAFE-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v2, 0xffe, v2 +; SI-SAFE-NEXT: v_bfe_u32 v3, v1, 20, 11 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-SAFE-NEXT: v_sub_i32_e32 v4, vcc, s4, v3 +; SI-SAFE-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-SAFE-NEXT: v_med3_i32 v4, v4, 0, 13 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v5, v4, v2 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 +; SI-SAFE-NEXT: s_movk_i32 s4, 0xfc10 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SAFE-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 12, v3 +; SI-SAFE-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-SAFE-NEXT: v_or_b32_e32 v4, v0, v4 +; SI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v4, 7, v2 +; SI-SAFE-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-SAFE-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-SAFE-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; SI-SAFE-NEXT: v_mov_b32_e32 v4, 0x7c00 +; SI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-SAFE-NEXT: v_mov_b32_e32 v5, 0x7e00 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SAFE-NEXT: s_movk_i32 s4, 0x40f +; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_fma_fneg_fneg_y_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NSZ-NEXT: v_fma_f32 v0, v0, -v1, -v2 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v2, -v2 +; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v3, -v1 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NSZ-NEXT: s_movk_i32 s4, 0x3f1 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-NSZ-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-NSZ-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v2, 0xffe, v2 +; SI-NSZ-NEXT: v_bfe_u32 v3, v1, 20, 11 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NSZ-NEXT: v_sub_i32_e32 v4, vcc, s4, v3 +; SI-NSZ-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-NSZ-NEXT: v_med3_i32 v4, v4, 0, 13 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v5, v4, v2 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 +; SI-NSZ-NEXT: s_movk_i32 s4, 0xfc10 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-NSZ-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v4, 12, v3 +; SI-NSZ-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NSZ-NEXT: v_or_b32_e32 v4, v0, v4 +; SI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v4, 7, v2 +; SI-NSZ-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-NSZ-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-NSZ-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; SI-NSZ-NEXT: v_mov_b32_e32 v4, 0x7c00 +; SI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NSZ-NEXT: v_mov_b32_e32 v5, 0x7e00 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NSZ-NEXT: s_movk_i32 s4, 0x40f +; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; SI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: v_fneg_fma_fneg_fneg_y_f16: @@ -2877,21 +3407,112 @@ define half @v_fneg_fma_fneg_x_fneg_f16(half %a, half %b, half %c) #0 { ; SI-SAFE-LABEL: v_fneg_fma_fneg_x_fneg_f16: ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_fma_f32 v0, -v0, v1, -v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e64 v2, -v2 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-SAFE-NEXT: v_cvt_f32_f16_e64 v4, -v0 +; SI-SAFE-NEXT: s_movk_i32 s4, 0x3f1 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-SAFE-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-SAFE-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v2, 0xffe, v2 +; SI-SAFE-NEXT: v_bfe_u32 v3, v1, 20, 11 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-SAFE-NEXT: v_sub_i32_e32 v4, vcc, s4, v3 +; SI-SAFE-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-SAFE-NEXT: v_med3_i32 v4, v4, 0, 13 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v5, v4, v2 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 +; SI-SAFE-NEXT: s_movk_i32 s4, 0xfc10 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SAFE-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 12, v3 +; SI-SAFE-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-SAFE-NEXT: v_or_b32_e32 v4, v0, v4 +; SI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v4, 7, v2 +; SI-SAFE-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-SAFE-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-SAFE-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; SI-SAFE-NEXT: v_mov_b32_e32 v4, 0x7c00 +; SI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-SAFE-NEXT: v_mov_b32_e32 v5, 0x7e00 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SAFE-NEXT: s_movk_i32 s4, 0x40f +; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_fma_fneg_x_fneg_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NSZ-NEXT: v_fma_f32 v0, v0, v1, v2 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NSZ-NEXT: s_movk_i32 s4, 0x3f1 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-NSZ-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-NSZ-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v2, 0xffe, v2 +; SI-NSZ-NEXT: v_bfe_u32 v3, v1, 20, 11 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NSZ-NEXT: v_sub_i32_e32 v4, vcc, s4, v3 +; SI-NSZ-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-NSZ-NEXT: v_med3_i32 v4, v4, 0, 13 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v5, v4, v2 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 +; SI-NSZ-NEXT: s_movk_i32 s4, 0xfc10 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-NSZ-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v4, 12, v3 +; SI-NSZ-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NSZ-NEXT: v_or_b32_e32 v4, v0, v4 +; SI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v4, 7, v2 +; SI-NSZ-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-NSZ-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-NSZ-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; SI-NSZ-NEXT: v_mov_b32_e32 v4, 0x7c00 +; SI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NSZ-NEXT: v_mov_b32_e32 v5, 0x7e00 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NSZ-NEXT: s_movk_i32 s4, 0x40f +; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; SI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: v_fneg_fma_fneg_x_fneg_f16: @@ -2943,21 +3564,112 @@ define half @v_fneg_fma_x_y_fneg_f16(half %a, half %b, half %c) #0 { ; SI-SAFE-LABEL: v_fneg_fma_x_y_fneg_f16: ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_fma_f32 v0, v0, v1, -v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e64 v2, -v2 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-SAFE-NEXT: s_movk_i32 s4, 0x3f1 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-SAFE-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-SAFE-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v2, 0xffe, v2 +; SI-SAFE-NEXT: v_bfe_u32 v3, v1, 20, 11 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-SAFE-NEXT: v_sub_i32_e32 v4, vcc, s4, v3 +; SI-SAFE-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-SAFE-NEXT: v_med3_i32 v4, v4, 0, 13 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v5, v4, v2 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 +; SI-SAFE-NEXT: s_movk_i32 s4, 0xfc10 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SAFE-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 12, v3 +; SI-SAFE-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-SAFE-NEXT: v_or_b32_e32 v4, v0, v4 +; SI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v4, 7, v2 +; SI-SAFE-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-SAFE-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-SAFE-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; SI-SAFE-NEXT: v_mov_b32_e32 v4, 0x7c00 +; SI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-SAFE-NEXT: v_mov_b32_e32 v5, 0x7e00 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SAFE-NEXT: s_movk_i32 s4, 0x40f +; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_fma_x_y_fneg_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NSZ-NEXT: v_fma_f32 v0, v0, -v1, v2 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v3, -v1 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NSZ-NEXT: s_movk_i32 s4, 0x3f1 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-NSZ-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-NSZ-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v2, 0xffe, v2 +; SI-NSZ-NEXT: v_bfe_u32 v3, v1, 20, 11 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NSZ-NEXT: v_sub_i32_e32 v4, vcc, s4, v3 +; SI-NSZ-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-NSZ-NEXT: v_med3_i32 v4, v4, 0, 13 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v5, v4, v2 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 +; SI-NSZ-NEXT: s_movk_i32 s4, 0xfc10 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-NSZ-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v4, 12, v3 +; SI-NSZ-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NSZ-NEXT: v_or_b32_e32 v4, v0, v4 +; SI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v4, 7, v2 +; SI-NSZ-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-NSZ-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-NSZ-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; SI-NSZ-NEXT: v_mov_b32_e32 v4, 0x7c00 +; SI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NSZ-NEXT: v_mov_b32_e32 v5, 0x7e00 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NSZ-NEXT: s_movk_i32 s4, 0x40f +; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; SI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: v_fneg_fma_x_y_fneg_f16: @@ -3008,26 +3720,115 @@ define { half, half } @v_fneg_fma_store_use_fneg_x_y_f16(half %a, half %b, half ; SI-SAFE-LABEL: v_fneg_fma_store_use_fneg_x_y_f16: ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-SAFE-NEXT: v_fma_f32 v0, v0, v3, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-SAFE-NEXT: v_xor_b32_e32 v3, 0xffff8000, v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-SAFE-NEXT: s_movk_i32 s4, 0x3f1 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[4:5], v2 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; SI-SAFE-NEXT: v_fma_f64 v[0:1], v[6:7], v[4:5], v[0:1] +; SI-SAFE-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v2, 0xffe, v2 +; SI-SAFE-NEXT: v_bfe_u32 v4, v1, 20, 11 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-SAFE-NEXT: v_sub_i32_e32 v5, vcc, s4, v4 +; SI-SAFE-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-SAFE-NEXT: v_med3_i32 v5, v5, 0, 13 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v6, v5, v2 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v5, v5, v6 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, v5, v2 +; SI-SAFE-NEXT: s_movk_i32 s4, 0xfc10 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SAFE-NEXT: v_add_i32_e32 v4, vcc, s4, v4 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v5, 12, v4 +; SI-SAFE-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-SAFE-NEXT: v_or_b32_e32 v5, v0, v5 +; SI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 1, v4 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v5, 7, v2 +; SI-SAFE-NEXT: v_cmp_lt_i32_e32 vcc, 5, v5 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 3, v5 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-SAFE-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-SAFE-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; SI-SAFE-NEXT: v_mov_b32_e32 v5, 0x7c00 +; SI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 31, v4 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; SI-SAFE-NEXT: v_mov_b32_e32 v6, 0x7e00 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SAFE-NEXT: s_movk_i32 s4, 0x40f +; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc +; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, s4, v4 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 +; SI-SAFE-NEXT: v_mov_b32_e32 v1, v3 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_fma_store_use_fneg_x_y_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NSZ-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NSZ-NEXT: v_fma_f32 v1, v3, v1, -v2 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NSZ-NEXT: v_xor_b32_e32 v1, 0x8000, v0 -; SI-NSZ-NEXT: v_mov_b32_e32 v0, v2 +; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v2, -v2 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NSZ-NEXT: s_movk_i32 s4, 0x3f1 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[1:2], v2 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[5:6], v5 +; SI-NSZ-NEXT: v_fma_f64 v[2:3], v[5:6], v[3:4], v[1:2] +; SI-NSZ-NEXT: v_xor_b32_e32 v1, 0xffff8000, v0 +; SI-NSZ-NEXT: v_and_b32_e32 v0, 0x1ff, v3 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v2, 8, v3 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v2, 0xffe, v2 +; SI-NSZ-NEXT: v_bfe_u32 v4, v3, 20, 11 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NSZ-NEXT: v_sub_i32_e32 v5, vcc, s4, v4 +; SI-NSZ-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-NSZ-NEXT: v_med3_i32 v5, v5, 0, 13 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v6, v5, v2 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v5, v5, v6 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, v5, v2 +; SI-NSZ-NEXT: s_movk_i32 s4, 0xfc10 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-NSZ-NEXT: v_add_i32_e32 v4, vcc, s4, v4 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v5, 12, v4 +; SI-NSZ-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NSZ-NEXT: v_or_b32_e32 v5, v0, v5 +; SI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 1, v4 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v5, 7, v2 +; SI-NSZ-NEXT: v_cmp_lt_i32_e32 vcc, 5, v5 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; SI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 3, v5 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-NSZ-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-NSZ-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; SI-NSZ-NEXT: v_mov_b32_e32 v5, 0x7c00 +; SI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 31, v4 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; SI-NSZ-NEXT: v_mov_b32_e32 v6, 0x7e00 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NSZ-NEXT: s_movk_i32 s4, 0x40f +; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc +; SI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, s4, v4 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; SI-NSZ-NEXT: v_and_b32_e32 v2, 0x8000, v2 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: v_fneg_fma_store_use_fneg_x_y_f16: @@ -3094,30 +3895,119 @@ define { half, half } @v_fneg_fma_multi_use_fneg_x_y_f16(half %a, half %b, half ; SI-SAFE-LABEL: v_fneg_fma_multi_use_fneg_x_y_f16: ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SI-SAFE-NEXT: v_cvt_f32_f16_e64 v8, -v0 +; SI-SAFE-NEXT: s_movk_i32 s4, 0x3f1 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[6:7], v8 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SAFE-NEXT: v_fma_f32 v1, v0, v1, v2 -; SI-SAFE-NEXT: v_mul_f32_e32 v2, v0, v3 -; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-SAFE-NEXT: v_fma_f64 v[0:1], v[6:7], v[4:5], v[0:1] +; SI-SAFE-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v2, 0xffe, v2 +; SI-SAFE-NEXT: v_bfe_u32 v4, v1, 20, 11 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-SAFE-NEXT: v_sub_i32_e32 v5, vcc, s4, v4 +; SI-SAFE-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-SAFE-NEXT: v_med3_i32 v5, v5, 0, 13 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v6, v5, v2 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v5, v5, v6 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, v5, v2 +; SI-SAFE-NEXT: s_movk_i32 s4, 0xfc10 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SAFE-NEXT: v_add_i32_e32 v4, vcc, s4, v4 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v5, 12, v4 +; SI-SAFE-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-SAFE-NEXT: v_or_b32_e32 v5, v0, v5 +; SI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 1, v4 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v5, 7, v2 +; SI-SAFE-NEXT: v_cmp_lt_i32_e32 vcc, 5, v5 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 3, v5 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-SAFE-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-SAFE-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; SI-SAFE-NEXT: v_mov_b32_e32 v5, 0x7c00 +; SI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 31, v4 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; SI-SAFE-NEXT: v_mov_b32_e32 v6, 0x7e00 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SAFE-NEXT: s_movk_i32 s4, 0x40f +; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc +; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, s4, v4 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v2, 0x8000, v1 +; SI-SAFE-NEXT: v_mul_f32_e32 v1, v8, v3 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_fma_multi_use_fneg_x_y_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v2, -v2 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v6, v0 +; SI-NSZ-NEXT: s_movk_i32 s4, 0x3f1 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[1:2], v2 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v4, -v4 -; SI-NSZ-NEXT: v_fma_f32 v0, v0, v1, -v2 -; SI-NSZ-NEXT: v_mul_f32_e32 v1, v4, v3 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NSZ-NEXT: v_fma_f64 v[1:2], v[6:7], v[4:5], v[1:2] +; SI-NSZ-NEXT: v_and_b32_e32 v4, 0x1ff, v2 +; SI-NSZ-NEXT: v_or_b32_e32 v1, v4, v1 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v4, 8, v2 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v4, 0xffe, v4 +; SI-NSZ-NEXT: v_bfe_u32 v5, v2, 20, 11 +; SI-NSZ-NEXT: v_or_b32_e32 v1, v4, v1 +; SI-NSZ-NEXT: v_sub_i32_e32 v6, vcc, s4, v5 +; SI-NSZ-NEXT: v_or_b32_e32 v4, 0x1000, v1 +; SI-NSZ-NEXT: v_med3_i32 v6, v6, 0, 13 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v7, v6, v4 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v6, v6, v7 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, v6, v4 +; SI-NSZ-NEXT: s_movk_i32 s4, 0xfc10 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-NSZ-NEXT: v_add_i32_e32 v5, vcc, s4, v5 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v6, 12, v5 +; SI-NSZ-NEXT: v_or_b32_e32 v4, v7, v4 +; SI-NSZ-NEXT: v_or_b32_e32 v6, v1, v6 +; SI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 1, v5 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v6, 7, v4 +; SI-NSZ-NEXT: v_cmp_lt_i32_e32 vcc, 5, v6 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; SI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 3, v6 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; SI-NSZ-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v4, 2, v4 +; SI-NSZ-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; SI-NSZ-NEXT: v_mov_b32_e32 v6, 0x7c00 +; SI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 31, v5 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; SI-NSZ-NEXT: v_mov_b32_e32 v7, 0x7e00 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NSZ-NEXT: s_movk_i32 s4, 0x40f +; SI-NSZ-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc +; SI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, s4, v5 +; SI-NSZ-NEXT: v_mul_f32_e32 v0, v0, v3 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v0 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NSZ-NEXT: v_and_b32_e32 v0, 0x8000, v2 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v0, v4 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: v_fneg_fma_multi_use_fneg_x_y_f16: @@ -3189,21 +4079,27 @@ define half @v_fneg_fmad_f16(half %a, half %b, half %c) #0 { ; SI-SAFE-LABEL: v_fneg_fmad_f16: ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_mac_f32_e32 v2, v0, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v2 +; SI-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SAFE-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_fmad_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v1, -v1 -; SI-NSZ-NEXT: v_mad_f32 v0, v0, v1, -v2 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NSZ-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NSZ-NEXT: v_sub_f32_e32 v0, v0, v1 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; @@ -3254,35 +4150,47 @@ define <4 x half> @v_fneg_fmad_v4f32(<4 x half> %a, <4 x half> %b, <4 x half> %c ; SI-SAFE-LABEL: v_fneg_fmad_v4f32: ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; SI-SAFE-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; SI-SAFE-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; SI-SAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; SI-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; SI-SAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; SI-SAFE-NEXT: v_mul_f32_e32 v6, v6, v7 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-SAFE-NEXT: v_mul_f32_e32 v1, v1, v3 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SAFE-NEXT: v_add_f32_e32 v6, v6, v7 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v7, v9 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; SI-SAFE-NEXT: v_mul_f32_e32 v3, v7, v3 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_mac_f32_e32 v9, v11, v10 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-SAFE-NEXT: v_mac_f32_e32 v5, v1, v3 -; SI-SAFE-NEXT: v_mac_f32_e32 v6, v8, v7 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v6 -; SI-SAFE-NEXT: v_mac_f32_e32 v4, v0, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-SAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; SI-SAFE-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-SAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-SAFE-NEXT: v_add_f32_e32 v2, v2, v3 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-SAFE-NEXT: v_add_f32_e32 v1, v1, v5 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SAFE-NEXT: v_add_f32_e32 v0, v0, v4 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-SAFE-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-SAFE-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 ; SI-SAFE-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 @@ -3291,38 +4199,50 @@ define <4 x half> @v_fneg_fmad_v4f32(<4 x half> %a, <4 x half> %b, <4 x half> %c ; SI-NSZ-LABEL: v_fneg_fmad_v4f32: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; SI-NSZ-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; SI-NSZ-NEXT: v_lshrrev_b32_e32 v7, 16, v0 ; SI-NSZ-NEXT: v_xor_b32_e32 v3, 0x80008000, v3 -; SI-NSZ-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NSZ-NEXT: v_lshrrev_b32_e32 v8, 16, v5 -; SI-NSZ-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; SI-NSZ-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NSZ-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v9, 16, v2 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NSZ-NEXT: v_mul_f32_e32 v6, v6, v7 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NSZ-NEXT: v_mul_f32_e32 v7, v7, v9 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NSZ-NEXT: v_mul_f32_e32 v1, v1, v3 +; SI-NSZ-NEXT: v_mul_f32_e32 v0, v0, v2 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v10, 16, v4 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v10 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NSZ-NEXT: v_mad_f32 v6, v7, v11, -v6 -; SI-NSZ-NEXT: v_mad_f32 v8, v9, v10, -v8 -; SI-NSZ-NEXT: v_mad_f32 v0, v0, v2, -v4 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v6 -; SI-NSZ-NEXT: v_mad_f32 v1, v1, v3, -v5 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v5 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NSZ-NEXT: v_sub_f32_e32 v6, v6, v8 +; SI-NSZ-NEXT: v_sub_f32_e32 v2, v7, v2 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NSZ-NEXT: v_sub_f32_e32 v1, v1, v3 +; SI-NSZ-NEXT: v_sub_f32_e32 v0, v0, v4 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v3, v8 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; SI-NSZ-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NSZ-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NSZ-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NSZ-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NSZ-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: v_fneg_fmad_v4f32: @@ -3390,25 +4310,33 @@ define { half, half } @v_fneg_fmad_multi_use_fmad_f16(half %a, half %b, half %c) ; SI-SAFE-LABEL: v_fneg_fmad_multi_use_fmad_f16: ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_mac_f32_e32 v2, v0, v1 -; SI-SAFE-NEXT: v_mul_f32_e32 v1, 4.0, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v2 +; SI-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SAFE-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 +; SI-SAFE-NEXT: v_mul_f32_e32 v1, 4.0, v1 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_fmad_multi_use_fmad_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v1, -v1 -; SI-NSZ-NEXT: v_mad_f32 v0, v0, v1, -v2 -; SI-NSZ-NEXT: v_mul_f32_e32 v1, -4.0, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NSZ-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NSZ-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v1, -v0 +; SI-NSZ-NEXT: v_mul_f32_e32 v1, 4.0, v1 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; @@ -3474,7 +4402,6 @@ define double @v_fneg_fp_extend_f16_to_f64(half %a) #0 { ; SI-LABEL: v_fneg_fp_extend_f16_to_f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3564,8 +4491,7 @@ define { double, half } @v_fneg_fp_extend_store_use_fneg_f16_to_f64(half %a) #0 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_xor_b32_e32 v2, 0x8000, v2 +; SI-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2 ; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -3617,10 +4543,10 @@ define { double, double } @v_fneg_multi_use_fp_extend_fneg_f16_to_f64(half %a) # ; SI-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 -; SI-NEXT: v_xor_b32_e32 v1, 0x80000000, v3 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cvt_f32_f16_e64 v1, -v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f64: @@ -3673,11 +4599,11 @@ define { double, double } @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f64(h ; SI-LABEL: v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v1 ; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; SI-NEXT: v_xor_b32_e32 v4, 0x80000000, v1 -; SI-NEXT: v_mul_f64 v[2:3], v[0:1], 4.0 -; SI-NEXT: v_mov_b32_e32 v1, v4 +; SI-NEXT: v_mul_f64 v[2:3], v[2:3], 4.0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f64: @@ -3820,7 +4746,6 @@ define half @v_fneg_fp_round_f64_to_f16(double %a) #0 { ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_fp_round_f64_to_f16: @@ -3972,7 +4897,6 @@ define half @v_fneg_fp_round_fneg_f64_to_f16(double %a) #0 { ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_fp_round_fneg_f64_to_f16: @@ -4125,7 +5049,6 @@ define { half, double } @v_fneg_fp_round_store_use_fneg_f64_to_f16(double %a) #0 ; SI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_mov_b32_e32 v1, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -4387,7 +5310,6 @@ define { half, double } @v_fneg_fp_round_multi_use_fneg_f64_to_f16(double %a, do ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; SI-NEXT: v_and_b32_e32 v5, 0x8000, v5 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_mul_f64 v[1:2], -v[0:1], v[2:3] ; SI-NEXT: v_mov_b32_e32 v0, v4 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -4546,9 +5468,8 @@ define { half, half } @v_fneg_multi_use_fp_round_fneg_f64_to_f16(double %a) #0 { ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; SI-NEXT: v_xor_b32_e32 v0, 0x8000, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0xffff8000, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_multi_use_fp_round_fneg_f64_to_f16: @@ -4761,7 +5682,6 @@ define half @v_fneg_trunc_f16(half %a) #0 { ; SI-LABEL: v_fneg_trunc_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_trunc_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -4798,33 +5718,20 @@ define half @v_fneg_trunc_f16(half %a) #0 { ; -------------------------------------------------------------------------------- define half @v_fneg_round_f16(half %a) #0 { -; SI-SAFE-LABEL: v_fneg_round_f16: -; SI-SAFE: ; %bb.0: -; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_trunc_f32_e32 v1, v0 -; SI-SAFE-NEXT: v_sub_f32_e32 v2, v0, v1 -; SI-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5 -; SI-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5] -; SI-SAFE-NEXT: s_brev_b32 s4, -2 -; SI-SAFE-NEXT: v_bfi_b32 v0, s4, v2, v0 -; SI-SAFE-NEXT: v_add_f32_e32 v0, v1, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; SI-NSZ-LABEL: v_fneg_round_f16: -; SI-NSZ: ; %bb.0: -; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NSZ-NEXT: v_trunc_f32_e32 v1, v0 -; SI-NSZ-NEXT: v_sub_f32_e32 v2, v0, v1 -; SI-NSZ-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5 -; SI-NSZ-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5] -; SI-NSZ-NEXT: s_brev_b32 s4, -2 -; SI-NSZ-NEXT: v_bfi_b32 v0, s4, v2, v0 -; SI-NSZ-NEXT: v_sub_f32_e64 v0, -v1, v0 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NSZ-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_fneg_round_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_trunc_f32_e32 v1, v0 +; SI-NEXT: v_sub_f32_e32 v2, v0, v1 +; SI-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5 +; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5] +; SI-NEXT: s_brev_b32 s4, -2 +; SI-NEXT: v_bfi_b32 v0, s4, v2, v0 +; SI-NEXT: v_add_f32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: v_fneg_round_f16: ; VI-SAFE: ; %bb.0: @@ -4927,7 +5834,6 @@ define half @v_fneg_rint_f16(half %a) #0 { ; SI-LABEL: v_fneg_rint_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_rndne_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -4967,7 +5873,6 @@ define half @v_fneg_nearbyint_f16(half %a) #0 { ; SI-LABEL: v_fneg_nearbyint_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_rndne_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -5007,7 +5912,6 @@ define half @v_fneg_sin_f16(half %a) #0 { ; SI-LABEL: v_fneg_sin_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_mul_f32_e32 v0, 0x3e22f983, v0 ; SI-NEXT: v_fract_f32_e32 v0, v0 @@ -5057,7 +5961,6 @@ define half @v_fneg_canonicalize_f16(half %a) #0 { ; SI-LABEL: v_fneg_canonicalize_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -5101,24 +6004,21 @@ define void @v_fneg_copytoreg_f16(ptr addrspace(1) %out, half %a, half %b, half ; SI-NEXT: v_and_b32_e32 v6, 0x3ff, v31 ; SI-NEXT: v_lshlrev_b32_e32 v6, 1, v6 ; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; SI-NEXT: v_mul_f32_e32 v2, v2, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_cbranch_execz .LBB81_2 ; SI-NEXT: ; %bb.1: ; %if ; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 -; SI-NEXT: v_cvt_f16_f32_e64 v4, -v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e64 v4, -v2 ; SI-NEXT: v_mul_f32_e32 v3, v4, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: flat_store_short v[0:1], v3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: .LBB81_2: ; %endif ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: flat_store_short v[0:1], v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -5235,9 +6135,8 @@ define half @v_fneg_inlineasm_f16(half %a, half %b, half %c, i32 %d) #0 { ; SI-LABEL: v_fneg_inlineasm_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: ;;#ASMSTART @@ -5296,8 +6195,8 @@ define half @v_fneg_inlineasm_multi_use_src_f16(ptr addrspace(1) %out, half %a, ; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 ; SI-NEXT: v_mul_f32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e64 v1, -v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_xor_b32_e32 v1, 0xffff8000, v0 ; SI-NEXT: ;;#ASMSTART ; SI-NEXT: ; use v1 ; SI-NEXT: ;;#ASMEND @@ -5363,12 +6262,94 @@ define { half, half } @multiuse_fneg_2_vop3_users_f16(half %a, half %b, half %c) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_fma_f32 v1, -v0, v1, v2 -; SI-NEXT: v_fma_f32 v2, -v0, v2, 2.0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: v_cvt_f32_f16_e64 v4, -v0 +; SI-NEXT: s_movk_i32 s4, 0x3f1 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-NEXT: s_movk_i32 s5, 0xfc10 +; SI-NEXT: s_movk_i32 s6, 0x40f +; SI-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[0:1] +; SI-NEXT: v_and_b32_e32 v6, 0x1ff, v3 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v3 +; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-NEXT: v_and_b32_e32 v6, 0xffe, v6 +; SI-NEXT: v_bfe_u32 v7, v3, 20, 11 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_sub_i32_e32 v8, vcc, s4, v7 +; SI-NEXT: v_or_b32_e32 v6, 0x1000, v2 +; SI-NEXT: v_med3_i32 v8, v8, 0, 13 +; SI-NEXT: v_lshrrev_b32_e32 v9, v8, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, v8, v9 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v8, v6 +; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, s5, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 12, v7 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: v_or_b32_e32 v8, v2, v8 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v7 +; SI-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; SI-NEXT: v_and_b32_e32 v8, 7, v6 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v8 +; SI-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v8 +; SI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_lshrrev_b32_e32 v6, 2, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; SI-NEXT: v_mov_b32_e32 v8, 0x7c00 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v7 +; SI-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; SI-NEXT: v_mov_b32_e32 v9, 0x7e00 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7 +; SI-NEXT: v_cndmask_b32_e32 v6, v6, v2, vcc +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0x8000, v2 +; SI-NEXT: v_fma_f64 v[1:2], v[4:5], v[0:1], 2.0 +; SI-NEXT: v_or_b32_e32 v0, v3, v6 +; SI-NEXT: v_and_b32_e32 v3, 0x1ff, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v2 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-NEXT: v_and_b32_e32 v3, 0xffe, v3 +; SI-NEXT: v_bfe_u32 v4, v2, 20, 11 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_sub_i32_e32 v5, vcc, s4, v4 +; SI-NEXT: v_or_b32_e32 v3, 0x1000, v1 +; SI-NEXT: v_med3_i32 v5, v5, 0, 13 +; SI-NEXT: v_lshrrev_b32_e32 v6, v5, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, v5, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v5, v3 +; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, s5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 12, v4 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: v_or_b32_e32 v5, v1, v5 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v4 +; SI-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; SI-NEXT: v_and_b32_e32 v5, 7, v3 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v5 +; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v5 +; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v4 +; SI-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-NEXT: v_and_b32_e32 v2, 0x8000, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: multiuse_fneg_2_vop3_users_f16: @@ -5417,14 +6398,13 @@ define { half, half } @multiuse_fneg_2_vop2_users_f16(half %a, half %b, half %c) ; SI-LABEL: multiuse_fneg_2_vop2_users_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: v_cvt_f32_f16_e64 v3, -v0 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_mul_f32_e32 v1, v0, v1 -; SI-NEXT: v_mul_f32_e32 v2, v0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_mul_f32_e32 v0, v3, v1 +; SI-NEXT: v_mul_f32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: multiuse_fneg_2_vop2_users_f16: @@ -5472,14 +6452,57 @@ define { half, half } @multiuse_fneg_vop2_vop3_users_f16(ptr addrspace(1) %out, ; SI-LABEL: multiuse_fneg_vop2_vop3_users_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 -; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 -; SI-NEXT: v_fma_f32 v1, v0, v1, 2.0 -; SI-NEXT: v_mul_f32_e32 v2, v0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; SI-NEXT: v_cvt_f32_f16_e64 v5, -v2 +; SI-NEXT: s_movk_i32 s4, 0x3f1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v5 +; SI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], 2.0 +; SI-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffe, v3 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_bfe_u32 v3, v1, 20, 11 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_sub_i32_e32 v6, vcc, s4, v3 +; SI-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-NEXT: v_med3_i32 v6, v6, 0, 13 +; SI-NEXT: v_lshrrev_b32_e32 v7, v6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v6, v6, v7 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v6, v2 +; SI-NEXT: s_movk_i32 s4, 0xfc10 +; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 12, v3 +; SI-NEXT: v_or_b32_e32 v2, v7, v2 +; SI-NEXT: v_or_b32_e32 v6, v0, v6 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; SI-NEXT: v_and_b32_e32 v6, 7, v2 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v6 +; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v6 +; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; SI-NEXT: v_mov_b32_e32 v6, 0x7c00 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; SI-NEXT: v_mov_b32_e32 v7, 0x7e00 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_movk_i32 s4, 0x40f +; SI-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 +; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, v5, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_and_b32_e32 v2, 0x8000, v2 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: multiuse_fneg_vop2_vop3_users_f16: @@ -5527,28 +6550,120 @@ define { half, half } @free_fold_src_code_size_cost_use_f16(ptr addrspace(1) %ou ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v4 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v5 -; SI-SAFE-NEXT: v_fma_f32 v0, v1, v0, 2.0 -; SI-SAFE-NEXT: v_mul_f32_e64 v1, -v0, v2 -; SI-SAFE-NEXT: v_mul_f32_e64 v2, -v0, v3 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SAFE-NEXT: s_movk_i32 s4, 0x3f1 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; SI-SAFE-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], 2.0 +; SI-SAFE-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v2, 0xffe, v3 +; SI-SAFE-NEXT: v_bfe_u32 v3, v1, 20, 11 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-SAFE-NEXT: v_sub_i32_e32 v6, vcc, s4, v3 +; SI-SAFE-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-SAFE-NEXT: v_med3_i32 v6, v6, 0, 13 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v7, v6, v2 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v6, v6, v7 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, v6, v2 +; SI-SAFE-NEXT: s_movk_i32 s4, 0xfc10 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SAFE-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v6, 12, v3 +; SI-SAFE-NEXT: v_or_b32_e32 v2, v7, v2 +; SI-SAFE-NEXT: v_or_b32_e32 v6, v0, v6 +; SI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v6, 7, v2 +; SI-SAFE-NEXT: v_cmp_lt_i32_e32 vcc, 5, v6 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 3, v6 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; SI-SAFE-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-SAFE-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; SI-SAFE-NEXT: v_mov_b32_e32 v6, 0x7c00 +; SI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; SI-SAFE-NEXT: v_mov_b32_e32 v7, 0x7e00 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SAFE-NEXT: s_movk_i32 s4, 0x40f +; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc +; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e64 v1, -v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-SAFE-NEXT: v_mul_f32_e32 v0, v1, v0 +; SI-SAFE-NEXT: v_mul_f32_e32 v1, v1, v2 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: free_fold_src_code_size_cost_use_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v4 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v5 -; SI-NSZ-NEXT: v_fma_f32 v0, v1, -v0, -2.0 -; SI-NSZ-NEXT: v_mul_f32_e32 v1, v0, v2 -; SI-NSZ-NEXT: v_mul_f32_e32 v2, v0, v3 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v0, -v3 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NSZ-NEXT: s_movk_i32 s4, 0x3f1 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; SI-NSZ-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], -2.0 +; SI-NSZ-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v2, 0xffe, v3 +; SI-NSZ-NEXT: v_bfe_u32 v3, v1, 20, 11 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NSZ-NEXT: v_sub_i32_e32 v6, vcc, s4, v3 +; SI-NSZ-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-NSZ-NEXT: v_med3_i32 v6, v6, 0, 13 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v7, v6, v2 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v6, v6, v7 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, v6, v2 +; SI-NSZ-NEXT: s_movk_i32 s4, 0xfc10 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-NSZ-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v6, 12, v3 +; SI-NSZ-NEXT: v_or_b32_e32 v2, v7, v2 +; SI-NSZ-NEXT: v_or_b32_e32 v6, v0, v6 +; SI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v6, 7, v2 +; SI-NSZ-NEXT: v_cmp_lt_i32_e32 vcc, 5, v6 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; SI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 3, v6 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; SI-NSZ-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-NSZ-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; SI-NSZ-NEXT: v_mov_b32_e32 v6, 0x7c00 +; SI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; SI-NSZ-NEXT: v_mov_b32_e32 v7, 0x7e00 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NSZ-NEXT: s_movk_i32 s4, 0x40f +; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc +; SI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-NSZ-NEXT: v_mul_f32_e32 v0, v1, v0 +; SI-NSZ-NEXT: v_mul_f32_e32 v1, v1, v2 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: free_fold_src_code_size_cost_use_f16: @@ -5620,12 +6735,59 @@ define half @one_use_cost_to_fold_into_src_f16(ptr addrspace(1) %out, half %a, h ; SI-LABEL: one_use_cost_to_fold_into_src_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_trunc_f32_e32 v1, v1 -; SI-NEXT: v_fma_f32 v0, -v1, v2, v0 +; SI-NEXT: s_movk_i32 s4, 0x3f1 +; SI-NEXT: v_trunc_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; SI-NEXT: v_cvt_f32_f16_e64 v4, -v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 +; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; SI-NEXT: v_bfe_u32 v4, v1, 20, 11 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffe, v3 +; SI-NEXT: v_sub_i32_e32 v3, vcc, s4, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-NEXT: v_med3_i32 v3, v3, 0, 13 +; SI-NEXT: v_lshrrev_b32_e32 v5, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, v3, v5 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v3, v2 +; SI-NEXT: s_movk_i32 s4, 0xfc10 +; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, s4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 12, v3 +; SI-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NEXT: v_or_b32_e32 v4, v0, v4 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NEXT: v_and_b32_e32 v4, 7, v2 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 +; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 +; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; SI-NEXT: v_mov_b32_e32 v4, 0x7c00 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NEXT: v_mov_b32_e32 v5, 0x7e00 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_movk_i32 s4, 0x40f +; SI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: one_use_cost_to_fold_into_src_f16: @@ -5670,15 +6832,63 @@ define { half, half } @multi_use_cost_to_fold_into_src(ptr addrspace(1) %out, ha ; SI-LABEL: multi_use_cost_to_fold_into_src: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v5 -; SI-NEXT: v_trunc_f32_e32 v1, v1 -; SI-NEXT: v_fma_f32 v0, -v1, v2, v0 -; SI-NEXT: v_mul_f32_e32 v1, v1, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_movk_i32 s4, 0x3f1 +; SI-NEXT: v_trunc_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; SI-NEXT: v_cvt_f32_f16_e64 v4, -v8 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 +; SI-NEXT: v_fma_f64 v[0:1], v[6:7], v[2:3], v[0:1] +; SI-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_bfe_u32 v4, v1, 20, 11 +; SI-NEXT: v_and_b32_e32 v2, 0xffe, v3 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_sub_i32_e32 v3, vcc, s4, v4 +; SI-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-NEXT: v_med3_i32 v3, v3, 0, 13 +; SI-NEXT: v_lshrrev_b32_e32 v6, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, v3, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v3, v2 +; SI-NEXT: s_movk_i32 s4, 0xfc10 +; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, s4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 12, v3 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_or_b32_e32 v4, v0, v4 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NEXT: v_and_b32_e32 v4, 7, v2 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 +; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 +; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; SI-NEXT: v_mov_b32_e32 v4, 0x7c00 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NEXT: v_mov_b32_e32 v6, 0x7e00 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v8 +; SI-NEXT: s_movk_i32 s4, 0x40f +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 +; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, v5, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_and_b32_e32 v2, 0x8000, v2 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: multi_use_cost_to_fold_into_src: @@ -5733,27 +6943,115 @@ define <2 x half> @fneg_fma_fneg_dagcombine_loop(<2 x half> %arg, <2 x half> %ar ; SI-LABEL: fneg_fma_fneg_dagcombine_loop: ; SI: ; %bb.0: ; %bb ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 -; SI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_xor_b32_e32 v7, 0x80008000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 +; SI-NEXT: s_mov_b32 s4, 0 +; SI-NEXT: s_brev_b32 s5, 1 +; SI-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 +; SI-NEXT: v_cvt_f64_f32_e32 v[5:6], v5 +; SI-NEXT: s_movk_i32 s6, 0x3f1 +; SI-NEXT: s_movk_i32 s7, 0xfc10 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_fma_f64 v[3:4], v[3:4], v[5:6], s[4:5] ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_and_b32_e32 v5, 0x1ff, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; SI-NEXT: v_and_b32_e32 v5, 0xffe, v5 +; SI-NEXT: v_bfe_u32 v6, v4, 20, 11 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_sub_i32_e32 v10, vcc, s6, v6 +; SI-NEXT: v_or_b32_e32 v5, 0x1000, v3 +; SI-NEXT: v_med3_i32 v10, v10, 0, 13 +; SI-NEXT: v_lshrrev_b32_e32 v11, v10, v5 +; SI-NEXT: v_lshlrev_b32_e32 v10, v10, v11 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v10, v5 +; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v10, 12, v6 +; SI-NEXT: v_or_b32_e32 v5, v11, v5 +; SI-NEXT: v_or_b32_e32 v10, v3, v10 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v6 +; SI-NEXT: v_cndmask_b32_e32 v5, v10, v5, vcc +; SI-NEXT: v_and_b32_e32 v10, 7, v5 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v10 +; SI-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v10 +; SI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_lshrrev_b32_e32 v5, 2, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; SI-NEXT: v_mov_b32_e32 v10, 0x7c00 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v6 +; SI-NEXT: v_cndmask_b32_e32 v5, v10, v5, vcc +; SI-NEXT: v_mov_b32_e32 v11, 0x7e00 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: s_movk_i32 s8, 0x40f +; SI-NEXT: v_cndmask_b32_e32 v3, v10, v11, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s8, v6 +; SI-NEXT: v_cndmask_b32_e32 v12, v5, v3, vcc +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; SI-NEXT: v_cvt_f64_f32_e32 v[3:4], v7 +; SI-NEXT: v_cvt_f64_f32_e32 v[5:6], v1 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0x8000, v13 +; SI-NEXT: v_or_b32_e32 v1, v1, v12 +; SI-NEXT: v_fma_f64 v[3:4], v[5:6], v[3:4], s[4:5] +; SI-NEXT: v_cvt_f32_f16_e32 v5, v9 +; SI-NEXT: v_and_b32_e32 v6, 0x1ff, v4 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v4 +; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; SI-NEXT: v_and_b32_e32 v6, 0xffe, v6 +; SI-NEXT: v_bfe_u32 v7, v4, 20, 11 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: v_sub_i32_e32 v9, vcc, s6, v7 +; SI-NEXT: v_or_b32_e32 v6, 0x1000, v3 +; SI-NEXT: v_med3_i32 v9, v9, 0, 13 +; SI-NEXT: v_lshrrev_b32_e32 v12, v9, v6 +; SI-NEXT: v_lshlrev_b32_e32 v9, v9, v12 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v9, v6 +; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 12, v7 +; SI-NEXT: v_or_b32_e32 v6, v12, v6 +; SI-NEXT: v_or_b32_e32 v9, v3, v9 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v7 +; SI-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc +; SI-NEXT: v_and_b32_e32 v9, 7, v6 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v9 +; SI-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v9 +; SI-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v9, v9, v12 +; SI-NEXT: v_lshrrev_b32_e32 v6, 2, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v7 +; SI-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: v_cndmask_b32_e32 v3, v10, v11, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s8, v7 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; SI-NEXT: v_and_b32_e32 v4, 0x8000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_brev_b32 s4, 1 -; SI-NEXT: v_fma_f32 v5, v5, v7, s4 -; SI-NEXT: v_sub_f32_e32 v4, v5, v4 -; SI-NEXT: v_fma_f32 v1, v1, v2, s4 -; SI-NEXT: v_sub_f32_e32 v0, v1, v0 -; SI-NEXT: v_mul_f32_e32 v1, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_sub_f32_e32 v1, v1, v5 +; SI-NEXT: v_sub_f32_e32 v0, v3, v0 +; SI-NEXT: v_mul_f32_e32 v1, v1, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_mul_f32_e32 v0, v0, v6 +; SI-NEXT: v_mul_f32_e32 v0, v0, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -5796,7 +7094,6 @@ define half @nnan_fmul_neg1_to_fneg(half %x, half %y) #0 { ; SI-LABEL: nnan_fmul_neg1_to_fneg: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1 @@ -5835,7 +7132,6 @@ define half @denormal_fmul_neg1_to_fneg(half %x, half %y) { ; SI-LABEL: denormal_fmul_neg1_to_fneg: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1 @@ -5873,11 +7169,12 @@ define half @denorm_snan_fmul_neg1_to_fneg(half %x, half %y) { ; SI-LABEL: denorm_snan_fmul_neg1_to_fneg: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v2, -v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e64 v2, -v2 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_mul_f32_e32 v0, v0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -5920,9 +7217,10 @@ define half @flush_snan_fmul_neg1_to_fneg(half %x, half %y) #0 { ; SI-LABEL: flush_snan_fmul_neg1_to_fneg: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -5965,12 +7263,11 @@ define half @fadd_select_fneg_fneg_f16(i32 %arg0, half %x, half %y, half %z) { ; SI-LABEL: fadd_select_fneg_fneg_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; SI-NEXT: v_sub_f32_e32 v0, v3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: v_sub_f32_e32 v0, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -6023,13 +7320,13 @@ define <2 x half> @fadd_select_fneg_fneg_v2f16(i32 %arg0, <2 x half> %x, <2 x ha ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_sub_f32_e32 v1, v4, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_sub_f32_e32 v1, v2, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_sub_f32_e32 v0, v2, v0 +; SI-NEXT: v_sub_f32_e32 v0, v3, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll index 5d23f648f707b..410316b1d4d76 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll @@ -7777,12 +7777,11 @@ define half @fadd_select_fneg_fneg_f16(i32 %arg0, half %x, half %y, half %z) { ; SI-LABEL: fadd_select_fneg_fneg_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; SI-NEXT: v_sub_f32_e32 v0, v3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: v_sub_f32_e32 v0, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -7810,13 +7809,13 @@ define <2 x half> @fadd_select_fneg_fneg_v2f16(i32 %arg0, <2 x half> %x, <2 x ha ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_sub_f32_e32 v1, v4, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_sub_f32_e32 v1, v2, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_sub_f32_e32 v0, v2, v0 +; SI-NEXT: v_sub_f32_e32 v0, v3, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll index afe0b8c3b392b..ca2aa47fbcf5b 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll @@ -938,7 +938,6 @@ define half @v_fneg_inv2pi_minnum_f16(half %a) #0 { ; SI-LABEL: v_fneg_inv2pi_minnum_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -960,7 +959,6 @@ define half @v_fneg_neg_inv2pi_minnum_f16(half %a) #0 { ; SI-LABEL: v_fneg_neg_inv2pi_minnum_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, 0x3e230000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1491,7 +1489,6 @@ define half @v_fneg_inv2pi_minimum_f16(half %a) #0 { ; SI-LABEL: v_fneg_inv2pi_minimum_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; SI-NEXT: v_max_f32_e32 v2, 0xbe230000, v0 @@ -1518,7 +1515,6 @@ define half @v_fneg_neg_inv2pi_minimum_f16(half %a) #0 { ; SI-LABEL: v_fneg_neg_inv2pi_minimum_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; SI-NEXT: v_max_f32_e32 v2, 0x3e230000, v0 @@ -2084,7 +2080,6 @@ define half @v_fneg_inv2pi_minimumnum_f16(half %a) #0 { ; SI-LABEL: v_fneg_inv2pi_minimumnum_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -2106,7 +2101,6 @@ define half @v_fneg_neg_inv2pi_minimumnum_f16(half %a) #0 { ; SI-LABEL: v_fneg_neg_inv2pi_minimumnum_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, 0x3e230000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -3759,12 +3753,11 @@ define half @fadd_select_fneg_fneg_f16(i32 %arg0, half %x, half %y, half %z) { ; SI-LABEL: fadd_select_fneg_fneg_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; SI-NEXT: v_sub_f32_e32 v0, v3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: v_sub_f32_e32 v0, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -3792,13 +3785,13 @@ define <2 x half> @fadd_select_fneg_fneg_v2f16(i32 %arg0, <2 x half> %x, <2 x ha ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_sub_f32_e32 v1, v4, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_sub_f32_e32 v1, v2, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_sub_f32_e32 v0, v2, v0 +; SI-NEXT: v_sub_f32_e32 v0, v3, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -4112,14 +4105,14 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f16(half %arg, i1 %a ; SI-NEXT: s_load_dword s2, s[4:5], 0x9 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; SI-NEXT: s_bitcmp1_b32 s2, 16 -; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[2:3] -; SI-NEXT: v_cndmask_b32_e64 v0, -v0, 0, s[2:3] -; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 +; SI-NEXT: s_and_b32 s3, 0x10000, s2 +; SI-NEXT: s_cselect_b32 s2, 0, s2 +; SI-NEXT: s_xor_b32 s2, s2, 0x8000 +; SI-NEXT: s_cmp_lg_u32 s3, 0 +; SI-NEXT: s_cselect_b32 s2, 0, s2 ; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v2, s2 ; SI-NEXT: flat_store_short v[0:1], v2 ; SI-NEXT: s_endpgm ; @@ -4149,12 +4142,11 @@ define half @v_fneg_select_infloop_regression_f16(half %arg, i1 %arg1) { ; SI-LABEL: v_fneg_select_infloop_regression_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_and_b32_e32 v1, 1, v1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; SI-NEXT: v_cndmask_b32_e64 v0, -v0, 0, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_select_infloop_regression_f16: diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll index cbd4017c6cf1c..00d53cd265c28 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -94,7 +94,6 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s1, s0, 16 -; CI-NEXT: s_and_b32 s0, s0, 0x7fff ; CI-NEXT: v_cvt_f32_f16_e32 v0, s1 ; CI-NEXT: v_cvt_f32_f16_e64 v1, -|s0| ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll index d9dea4f1fd6e7..9b44acd5c0716 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll @@ -601,16 +601,14 @@ define half @select_fneg_select_f16(i1 %cond0, i1 %cond1, half %arg0, half %arg1 ; GFX7-LABEL: select_fneg_select_f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e64 v2, -v2 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX7-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX7-NEXT: v_xor_b32_e32 v2, 0xffff8000, v0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, -v0, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: select_fneg_select_f16: @@ -720,37 +718,23 @@ define <2 x half> @select_fneg_select_v2f16(<2 x i1> %cond0, <2 x i1> %cond1, <2 ; GFX7-LABEL: select_fneg_select_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_xor_b32_e32 v4, 0x80008000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7-NEXT: v_xor_b32_e32 v4, 0x80008000, v4 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: v_bfi_b32 v4, s4, v0, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_xor_b32_e32 v4, 0x80008000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: select_fneg_select_v2f16: @@ -1320,11 +1304,11 @@ define double @fneg_f64_bitcast_build_vector_v4f16_to_f64(half %elt0, half %elt1 ; GFX7-LABEL: fneg_f64_bitcast_build_vector_v4f16_to_f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_xor_b32_e32 v1, 0x80000000, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll index cab27fca5ab0a..3140b87c8108e 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll @@ -631,17 +631,17 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 { ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_load_dword v0, v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_mul_f32_e32 v0, -4.0, v0 -; CI-NEXT: v_sub_f32_e32 v1, 2.0, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_mul_f32_e32 v1, -4.0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: flat_store_short v[0:1], v0 -; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_sub_f32_e32 v0, 2.0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: flat_store_short v[0:1], v1 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: flat_store_short v[0:1], v0 +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_endpgm ; ; GFX8-LABEL: v_extract_fneg_fold_v2f16: diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll index 7ff5bbf4821b7..303864ff9434a 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll @@ -599,11 +599,11 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou ; SI-NEXT: s_load_dword s4, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_movk_i32 s5, 0x204 +; SI-NEXT: s_mov_b32 s5, 0x7f800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: v_cmp_class_f32_e64 s[4:5], v0, s5 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: v_cvt_f32_f16_e64 v0, |s4| +; SI-NEXT: v_cmp_eq_f32_e32 vcc, s5, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -642,16 +642,19 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocapture %out, half %x) #0 { ; SI-LABEL: test_isfinite_pattern_0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SI-NEXT: s_load_dword s4, s[4:5], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_movk_i32 s5, 0x1f8 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x9 +; SI-NEXT: s_load_dword s0, s[4:5], 0xb +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s1, 0x7f800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: v_cmp_class_f32_e64 s[4:5], v0, s5 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: v_cvt_f32_f16_e64 v1, |s0| +; SI-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; SI-NEXT: v_cmp_neq_f32_e64 s[0:1], s1, v1 +; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isfinite_pattern_0_f16: @@ -695,11 +698,11 @@ define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocaptur ; SI-NEXT: s_load_dword s4, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_movk_i32 s5, 0x1f8 +; SI-NEXT: s_mov_b32 s5, 0x7f800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: v_cmp_class_f32_e64 s[4:5], v0, s5 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: v_cvt_f32_f16_e64 v0, |s4| +; SI-NEXT: v_cmp_lg_f32_e32 vcc, s5, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll index 8df756481e54a..94c2d3364a769 100644 --- a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll @@ -184,12 +184,12 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f32( ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s8, s2 ; SI-NEXT: s_mov_b32 s9, s3 -; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -681,7 +681,7 @@ define amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32( ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 ; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -985,13 +985,13 @@ define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32( ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e64 v1, |v0| ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e64 v1, |v0|, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_mul_f32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_short v1, off, s[4:7], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; @@ -1087,7 +1087,7 @@ define amdgpu_kernel void @fabs_fneg_multi_use_fpext_f16_to_f32( ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, 0x8000, v0 +; SI-NEXT: v_or_b32_e32 v0, 0xffff8000, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 ; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -1189,10 +1189,10 @@ define amdgpu_kernel void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32( ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e64 v1, -|v0|, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -|v0| +; SI-NEXT: v_mul_f32_e32 v1, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v0, 0x80000000, v0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_short v1, off, s[4:7], 0 diff --git a/llvm/test/CodeGen/AMDGPU/fpow.ll b/llvm/test/CodeGen/AMDGPU/fpow.ll index 3f1aea2e3773d..862c7ac9f762b 100644 --- a/llvm/test/CodeGen/AMDGPU/fpow.ll +++ b/llvm/test/CodeGen/AMDGPU/fpow.ll @@ -518,8 +518,8 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) { ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_log_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_log_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_legacy_f32_e32 v2, v3, v2 @@ -660,23 +660,23 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_log_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_log_f32_e32 v2, v2 -; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v3, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_log_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_legacy_f32_e32 v2, v3, v2 +; GFX6-NEXT: v_exp_f32_e32 v2, v2 +; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v2 -; GFX6-NEXT: v_exp_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_v2f16_fneg_lhs_rhs: diff --git a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll index a043d537fbc45..ac269ee0d5abe 100644 --- a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll @@ -285,8 +285,8 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i16( ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_i32_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -384,8 +384,8 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i32( ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_i32_f32_e32 v1, v1 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -487,8 +487,8 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i64( ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 ; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 ; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; SI-NEXT: v_cvt_i32_f32_e32 v2, v2 ; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll index d8660617c7677..9016d4fd67d62 100644 --- a/llvm/test/CodeGen/AMDGPU/fract-match.ll +++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll @@ -1580,7 +1580,11 @@ define half @basic_fract_f16_nonan(half nofpclass(nan) %x) { ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_floor_f32_e32 v1, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_min_f32_e32 v0, 0x3f7fe000, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -1590,7 +1594,11 @@ define half @basic_fract_f16_nonan(half nofpclass(nan) %x) { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_floor_f32_e32 v1, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_min_f32_e32 v0, 0x3f7fe000, v0 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -1673,10 +1681,18 @@ define <2 x half> @basic_fract_v2f16_nonan(<2 x half> nofpclass(nan) %x) { ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_floor_f32_e32 v3, v1 -; GFX6-NEXT: v_floor_f32_e32 v2, v0 -; GFX6-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX6-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_floor_f32_e32 v2, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_floor_f32_e32 v3, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_sub_f32_e32 v0, v0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_min_f32_e32 v1, 0x3f7fe000, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_min_f32_e32 v0, 0x3f7fe000, v0 @@ -1691,10 +1707,18 @@ define <2 x half> @basic_fract_v2f16_nonan(<2 x half> nofpclass(nan) %x) { ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_floor_f32_e32 v3, v1 -; GFX7-NEXT: v_floor_f32_e32 v2, v0 -; GFX7-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX7-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_floor_f32_e32 v2, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_floor_f32_e32 v3, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_sub_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_min_f32_e32 v1, 0x3f7fe000, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_min_f32_e32 v0, 0x3f7fe000, v0 @@ -1859,38 +1883,44 @@ define half @safe_math_fract_f16_noinf_check(half %x, ptr addrspace(1) writeonly ; GFX6-LABEL: safe_math_fract_f16_noinf_check: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: v_floor_f32_e32 v3, v0 -; GFX6-NEXT: v_sub_f32_e32 v4, v0, v3 -; GFX6-NEXT: v_min_f32_e32 v4, 0x3f7fe000, v4 -; GFX6-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_floor_f32_e32 v4, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_store_short v3, v[1:2], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX6-NEXT: buffer_store_short v4, v[1:2], s[4:7], 0 addr64 +; GFX6-NEXT: v_sub_f32_e32 v5, v3, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_min_f32_e32 v5, 0x3f7fe000, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: safe_math_fract_f16_noinf_check: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: v_floor_f32_e32 v3, v0 -; GFX7-NEXT: v_sub_f32_e32 v4, v0, v3 -; GFX7-NEXT: v_min_f32_e32 v4, 0x3f7fe000, v4 -; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_floor_f32_e32 v4, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_store_short v3, v[1:2], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX7-NEXT: buffer_store_short v4, v[1:2], s[4:7], 0 addr64 +; GFX7-NEXT: v_sub_f32_e32 v5, v3, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_min_f32_e32 v5, 0x3f7fe000, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -2486,44 +2516,52 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) writeonly captures(no ; GFX6-LABEL: safe_math_fract_f16: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX6-NEXT: v_cvt_f32_f16_e64 v6, |v0| +; GFX6-NEXT: s_mov_b32 s8, 0x7f800000 ; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_floor_f32_e32 v4, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: v_floor_f32_e32 v3, v0 -; GFX6-NEXT: v_sub_f32_e32 v4, v0, v3 -; GFX6-NEXT: v_min_f32_e32 v4, 0x3f7fe000, v4 -; GFX6-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc -; GFX6-NEXT: v_cmp_neq_f32_e64 vcc, |v0|, s4 -; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_store_short v3, v[1:2], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_store_short v4, v[1:2], s[4:7], 0 addr64 +; GFX6-NEXT: v_sub_f32_e32 v5, v3, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_min_f32_e32 v5, 0x3f7fe000, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GFX6-NEXT: v_cmp_neq_f32_e32 vcc, s8, v6 +; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: safe_math_fract_f16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX7-NEXT: v_cvt_f32_f16_e64 v6, |v0| +; GFX7-NEXT: s_mov_b32 s8, 0x7f800000 ; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_floor_f32_e32 v4, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: v_floor_f32_e32 v3, v0 -; GFX7-NEXT: v_sub_f32_e32 v4, v0, v3 -; GFX7-NEXT: v_min_f32_e32 v4, 0x3f7fe000, v4 -; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc -; GFX7-NEXT: v_cmp_neq_f32_e64 vcc, |v0|, s4 -; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_store_short v3, v[1:2], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_store_short v4, v[1:2], s[4:7], 0 addr64 +; GFX7-NEXT: v_sub_f32_e32 v5, v3, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_min_f32_e32 v5, 0x3f7fe000, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GFX7-NEXT: v_cmp_neq_f32_e32 vcc, s8, v6 +; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -2654,38 +2692,45 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) writeon ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX6-NEXT: s_movk_i32 s8, 0x7c00 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_floor_f32_e32 v5, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v5 -; GFX6-NEXT: v_floor_f32_e32 v7, v4 -; GFX6-NEXT: v_sub_f32_e32 v5, v3, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_sub_f32_e32 v7, v4, v7 -; GFX6-NEXT: v_min_f32_e32 v5, 0x3f7fe000, v5 -; GFX6-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX6-NEXT: v_min_f32_e32 v7, 0x3f7fe000, v7 -; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX6-NEXT: v_floor_f32_e32 v6, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_floor_f32_e32 v7, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX6-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v9, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX6-NEXT: v_sub_f32_e32 v8, v4, v8 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX6-NEXT: v_sub_f32_e32 v9, v5, v9 +; GFX6-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: v_min_f32_e32 v7, 0x3f7fe000, v8 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_min_f32_e32 v8, 0x3f7fe000, v9 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX6-NEXT: buffer_store_dword v6, v[1:2], s[4:7], 0 addr64 +; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX6-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0x7fff, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v0, vcc ; GFX6-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, s8, v5 +; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, s8, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, s8, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: v_or_b32_e32 v4, v8, v4 -; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX6-NEXT: buffer_store_dword v4, v[1:2], s[4:7], 0 addr64 +; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -2693,38 +2738,45 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) writeon ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX7-NEXT: s_movk_i32 s8, 0x7c00 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_floor_f32_e32 v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v5 -; GFX7-NEXT: v_floor_f32_e32 v7, v4 -; GFX7-NEXT: v_sub_f32_e32 v5, v3, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_sub_f32_e32 v7, v4, v7 -; GFX7-NEXT: v_min_f32_e32 v5, 0x3f7fe000, v5 -; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX7-NEXT: v_min_f32_e32 v7, 0x3f7fe000, v7 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX7-NEXT: v_floor_f32_e32 v6, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_floor_f32_e32 v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX7-NEXT: v_sub_f32_e32 v8, v4, v8 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX7-NEXT: v_sub_f32_e32 v9, v5, v9 +; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: v_min_f32_e32 v7, 0x3f7fe000, v8 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_min_f32_e32 v8, 0x3f7fe000, v9 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX7-NEXT: buffer_store_dword v6, v[1:2], s[4:7], 0 addr64 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0x7fff, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v8, v0, vcc ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX7-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s8, v5 +; GFX7-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s8, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s8, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: v_or_b32_e32 v4, v8, v4 -; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX7-NEXT: buffer_store_dword v4, v[1:2], s[4:7], 0 addr64 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/freeze.ll b/llvm/test/CodeGen/AMDGPU/freeze.ll index fcadfcdd087be..2ee95e535b04a 100644 --- a/llvm/test/CodeGen/AMDGPU/freeze.ll +++ b/llvm/test/CodeGen/AMDGPU/freeze.ll @@ -5414,21 +5414,21 @@ define void @freeze_i256(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-GISEL-LABEL: freeze_i256: -; GFX8-GISEL: ; %bb.0: -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] -; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[0:1] -; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 16, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(1) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7] -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(1) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[8:11] -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX8-LABEL: freeze_i256: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[8:11] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: freeze_i256: ; GFX9-GISEL: ; %bb.0: @@ -6156,6 +6156,8 @@ define void @freeze_f16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX6-SDAG-NEXT: s_mov_b32 s5, s6 ; GFX6-SDAG-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-SDAG-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -6181,6 +6183,8 @@ define void @freeze_f16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX7-SDAG-NEXT: s_mov_b32 s5, s6 ; GFX7-SDAG-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -6352,6 +6356,15 @@ define void @freeze_v3f16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX6-SDAG-NEXT: s_mov_b32 s5, s6 ; GFX6-SDAG-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-SDAG-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX6-SDAG-NEXT: buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4 ; GFX6-SDAG-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -6381,6 +6394,15 @@ define void @freeze_v3f16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX7-SDAG-NEXT: s_mov_b32 s5, s6 ; GFX7-SDAG-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX7-SDAG-NEXT: buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4 ; GFX7-SDAG-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll index 88f6427d94042..374747ada621b 100644 --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -24,37 +24,32 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; SI-NEXT: s_mov_b32 s2, s6 ; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e64 v4, |v0| ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_and_b32_e32 v3, 0x7fffffff, v0 -; SI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 -; SI-NEXT: v_cmp_ngt_f32_e64 s[0:1], |v0|, |v1| -; SI-NEXT: s_and_b64 vcc, exec, s[0:1] -; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v2, |v1| +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v2 ; SI-NEXT: s_cbranch_vccz .LBB0_2 ; SI-NEXT: ; %bb.1: ; %frem.else -; SI-NEXT: v_and_b32_e32 v5, 0x80000000, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2 -; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; SI-NEXT: v_and_b32_e32 v3, 0x8000, v0 +; SI-NEXT: v_cmp_eq_f32_e32 vcc, v4, v2 +; SI-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execz .LBB0_3 ; SI-NEXT: s_branch .LBB0_8 ; SI-NEXT: .LBB0_2: -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: .LBB0_3: ; %frem.compute ; SI-NEXT: s_mov_b32 s3, 0x7f800000 -; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s3 -; SI-NEXT: v_frexp_exp_i32_f32_e32 v4, v3 +; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v4|, s3 +; SI-NEXT: v_frexp_exp_i32_f32_e32 v3, v4 ; SI-NEXT: s_and_b64 s[0:1], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s0, v4 +; SI-NEXT: v_readfirstlane_b32 s0, v3 ; SI-NEXT: s_cselect_b32 s2, s0, 0 -; SI-NEXT: v_frexp_mant_f32_e32 v4, v3 -; SI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; SI-NEXT: v_frexp_mant_f32_e32 v3, v4 +; SI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; SI-NEXT: v_ldexp_f32_e64 v3, v3, 11 ; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v2|, s3 ; SI-NEXT: v_frexp_mant_f32_e32 v4, v2 @@ -111,25 +106,20 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; SI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; SI-NEXT: v_ldexp_f32_e64 v2, v2, s0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_bfi_b32 v4, s0, v2, v0 +; SI-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0x8000, v0 +; SI-NEXT: v_or_b32_e32 v3, v2, v3 ; SI-NEXT: .LBB0_8: ; %Flow19 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0| ; SI-NEXT: s_mov_b32 s0, 0x7f800000 ; SI-NEXT: v_cmp_nle_f32_e64 s[0:1], s0, v0 ; SI-NEXT: s_and_b64 vcc, s[0:1], vcc -; SI-NEXT: v_mov_b32_e32 v0, 0x7fc00000 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_mov_b32_e32 v0, 0x7e00 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; @@ -143,37 +133,32 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s4, s10 ; CI-NEXT: s_mov_b32 s5, s11 -; CI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; CI-NEXT: s_mov_b32 s3, s7 +; CI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; CI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e64 v4, |v0| ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v0 -; CI-NEXT: v_and_b32_e32 v4, 0x7fffffff, v0 -; CI-NEXT: v_cmp_ngt_f32_e64 s[0:1], |v0|, |v1| -; CI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 -; CI-NEXT: s_and_b64 vcc, exec, s[0:1] +; CI-NEXT: v_cvt_f32_f16_e64 v3, |v1| +; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v3 ; CI-NEXT: s_cbranch_vccz .LBB0_2 ; CI-NEXT: ; %bb.1: ; %frem.else -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_and_b32_e32 v5, 0x80000000, v0 -; CI-NEXT: v_cmp_eq_f32_e32 vcc, v4, v2 -; CI-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; CI-NEXT: v_and_b32_e32 v2, 0x8000, v0 +; CI-NEXT: v_cmp_eq_f32_e32 vcc, v4, v3 +; CI-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; CI-NEXT: s_cbranch_execz .LBB0_3 ; CI-NEXT: s_branch .LBB0_8 ; CI-NEXT: .LBB0_2: -; CI-NEXT: ; implicit-def: $vgpr3 +; CI-NEXT: ; implicit-def: $vgpr2 ; CI-NEXT: .LBB0_3: ; %frem.compute -; CI-NEXT: v_frexp_mant_f32_e32 v3, v4 -; CI-NEXT: v_ldexp_f32_e64 v5, v3, 11 -; CI-NEXT: v_frexp_mant_f32_e32 v3, v2 -; CI-NEXT: v_ldexp_f32_e64 v3, v3, 1 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v7, v4 +; CI-NEXT: v_frexp_mant_f32_e32 v2, v4 +; CI-NEXT: v_frexp_mant_f32_e32 v4, v3 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v8, v3 +; CI-NEXT: v_ldexp_f32_e64 v3, v4, 1 ; CI-NEXT: v_div_scale_f32 v9, s[0:1], v3, v3, 1.0 -; CI-NEXT: v_frexp_exp_i32_f32_e32 v8, v2 +; CI-NEXT: v_ldexp_f32_e64 v5, v2, 11 ; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v8 -; CI-NEXT: v_frexp_exp_i32_f32_e32 v7, v4 ; CI-NEXT: v_not_b32_e32 v4, v2 ; CI-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; CI-NEXT: v_div_scale_f32 v6, vcc, 1.0, v3, 1.0 @@ -219,25 +204,20 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; CI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; CI-NEXT: v_ldexp_f32_e32 v2, v3, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: s_brev_b32 s0, -2 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_bfi_b32 v3, s0, v2, v0 +; CI-NEXT: v_and_b32_e32 v3, 0x8000, v0 +; CI-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: .LBB0_8: ; %Flow19 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: s_mov_b32 s0, 0x7f800000 -; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; CI-NEXT: s_mov_b32 s10, -1 +; CI-NEXT: s_mov_b32 s0, 0x7f800000 +; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v3 ; CI-NEXT: v_cmp_nle_f32_e64 s[0:1], s0, v0 ; CI-NEXT: s_and_b64 vcc, s[0:1], vcc -; CI-NEXT: v_mov_b32_e32 v0, 0x7fc00000 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_mov_b32_e32 v0, 0x7e00 +; CI-NEXT: s_mov_b32 s10, -1 +; CI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; CI-NEXT: s_endpgm ; @@ -1248,84 +1228,181 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: fast_frem_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s0 -; SI-NEXT: s_mov_b32 s9, s1 -; SI-NEXT: s_mov_b32 s0, s2 -; SI-NEXT: s_mov_b32 s1, s3 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_mov_b32 s0, s8 +; SI-NEXT: s_mov_b32 s1, s9 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s11 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 -; SI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 -; SI-NEXT: v_rcp_f32_e32 v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_div_scale_f32 v0, vcc, v2, v4, v2 +; SI-NEXT: v_div_scale_f32 v1, s[4:5], v4, v4, v2 +; SI-NEXT: v_rcp_f32_e32 v3, v1 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v5, -v3, v4, 1.0 -; SI-NEXT: v_fma_f32 v4, v5, v4, v4 -; SI-NEXT: v_mul_f32_e32 v5, v2, v4 -; SI-NEXT: v_fma_f32 v6, -v3, v5, v2 -; SI-NEXT: v_fma_f32 v5, v6, v4, v5 -; SI-NEXT: v_fma_f32 v2, -v3, v5, v2 +; SI-NEXT: v_fma_f32 v5, -v1, v3, 1.0 +; SI-NEXT: v_fma_f32 v3, v5, v3, v3 +; SI-NEXT: v_mul_f32_e32 v5, v0, v3 +; SI-NEXT: v_fma_f32 v6, -v1, v5, v0 +; SI-NEXT: v_fma_f32 v5, v6, v3, v5 +; SI-NEXT: v_fma_f32 v0, -v1, v5, v0 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; SI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 -; SI-NEXT: v_trunc_f32_e32 v2, v2 -; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 +; SI-NEXT: v_div_fmas_f32 v0, v0, v3, v5 +; SI-NEXT: v_div_fixup_f32 v0, v0, v4, v2 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_trunc_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[2:3] +; SI-NEXT: v_readfirstlane_b32 s4, v1 +; SI-NEXT: s_and_b32 s5, s4, 0x1ff +; SI-NEXT: v_or_b32_e32 v0, s5, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: s_lshr_b32 s5, s4, 8 +; SI-NEXT: s_and_b32 s5, s5, 0xffe +; SI-NEXT: v_readfirstlane_b32 s6, v0 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s6, s5, 0x1000 +; SI-NEXT: s_bfe_u32 s7, s4, 0xb0014 +; SI-NEXT: s_sub_i32 s8, 0x3f1, s7 +; SI-NEXT: v_med3_i32 v0, s8, 0, 13 +; SI-NEXT: v_readfirstlane_b32 s8, v0 +; SI-NEXT: s_lshr_b32 s9, s6, s8 +; SI-NEXT: s_lshl_b32 s8, s9, s8 +; SI-NEXT: s_cmp_lg_u32 s8, s6 +; SI-NEXT: s_cselect_b32 s6, 1, 0 +; SI-NEXT: s_or_b32 s6, s9, s6 +; SI-NEXT: s_addk_i32 s7, 0xfc10 +; SI-NEXT: s_lshl_b32 s8, s7, 12 +; SI-NEXT: s_or_b32 s8, s5, s8 +; SI-NEXT: s_cmp_lt_i32 s7, 1 +; SI-NEXT: s_cselect_b32 s6, s6, s8 +; SI-NEXT: s_and_b32 s8, s6, 7 +; SI-NEXT: s_cmp_gt_i32 s8, 5 +; SI-NEXT: s_cselect_b32 s9, 1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 3 +; SI-NEXT: s_cselect_b32 s8, 1, 0 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_lshr_b32 s6, s6, 2 +; SI-NEXT: s_add_i32 s6, s6, s8 +; SI-NEXT: s_cmp_lt_i32 s7, 31 +; SI-NEXT: s_cselect_b32 s6, s6, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s5, 0 +; SI-NEXT: s_movk_i32 s5, 0x7e00 +; SI-NEXT: s_cselect_b32 s5, s5, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s7, 0x40f +; SI-NEXT: s_cselect_b32 s5, s5, s6 +; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: s_and_b32 s4, s4, 0x8000 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: fast_frem_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, -1 -; CI-NEXT: s_mov_b32 s6, s10 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_mov_b32 s6, s2 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s8, s0 -; CI-NEXT: s_mov_b32 s9, s1 -; CI-NEXT: s_mov_b32 s0, s2 -; CI-NEXT: s_mov_b32 s1, s3 -; CI-NEXT: s_mov_b32 s2, s10 -; CI-NEXT: s_mov_b32 s3, s11 -; CI-NEXT: s_mov_b32 s7, s11 -; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; CI-NEXT: s_mov_b32 s0, s8 +; CI-NEXT: s_mov_b32 s1, s9 +; CI-NEXT: s_mov_b32 s8, s10 +; CI-NEXT: s_mov_b32 s9, s11 +; CI-NEXT: s_mov_b32 s10, s2 +; CI-NEXT: s_mov_b32 s11, s3 +; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; CI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:8 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 -; CI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 -; CI-NEXT: v_rcp_f32_e32 v4, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; CI-NEXT: v_div_scale_f32 v1, s[4:5], v4, v4, v2 +; CI-NEXT: v_div_scale_f32 v0, vcc, v2, v4, v2 +; CI-NEXT: v_rcp_f32_e32 v3, v1 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v5, -v3, v4, 1.0 -; CI-NEXT: v_fma_f32 v4, v5, v4, v4 -; CI-NEXT: v_mul_f32_e32 v5, v2, v4 -; CI-NEXT: v_fma_f32 v6, -v3, v5, v2 -; CI-NEXT: v_fma_f32 v5, v6, v4, v5 -; CI-NEXT: v_fma_f32 v2, -v3, v5, v2 +; CI-NEXT: v_fma_f32 v5, -v1, v3, 1.0 +; CI-NEXT: v_fma_f32 v3, v5, v3, v3 +; CI-NEXT: v_mul_f32_e32 v5, v0, v3 +; CI-NEXT: v_fma_f32 v6, -v1, v5, v0 +; CI-NEXT: v_fma_f32 v5, v6, v3, v5 +; CI-NEXT: v_fma_f32 v0, -v1, v5, v0 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 -; CI-NEXT: v_trunc_f32_e32 v2, v2 -; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 +; CI-NEXT: v_div_fmas_f32 v0, v0, v3, v5 +; CI-NEXT: v_div_fixup_f32 v0, v0, v4, v2 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_trunc_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; CI-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[2:3] +; CI-NEXT: v_readfirstlane_b32 s4, v1 +; CI-NEXT: s_and_b32 s5, s4, 0x1ff +; CI-NEXT: v_or_b32_e32 v0, s5, v0 +; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; CI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CI-NEXT: s_lshr_b32 s5, s4, 8 +; CI-NEXT: s_bfe_u32 s7, s4, 0xb0014 +; CI-NEXT: s_and_b32 s5, s5, 0xffe +; CI-NEXT: v_readfirstlane_b32 s6, v0 +; CI-NEXT: s_sub_i32 s8, 0x3f1, s7 +; CI-NEXT: s_or_b32 s5, s5, s6 +; CI-NEXT: v_med3_i32 v0, s8, 0, 13 +; CI-NEXT: s_or_b32 s6, s5, 0x1000 +; CI-NEXT: v_readfirstlane_b32 s8, v0 +; CI-NEXT: s_lshr_b32 s9, s6, s8 +; CI-NEXT: s_lshl_b32 s8, s9, s8 +; CI-NEXT: s_cmp_lg_u32 s8, s6 +; CI-NEXT: s_cselect_b32 s6, 1, 0 +; CI-NEXT: s_addk_i32 s7, 0xfc10 +; CI-NEXT: s_lshl_b32 s8, s7, 12 +; CI-NEXT: s_or_b32 s6, s9, s6 +; CI-NEXT: s_or_b32 s8, s5, s8 +; CI-NEXT: s_cmp_lt_i32 s7, 1 +; CI-NEXT: s_cselect_b32 s6, s6, s8 +; CI-NEXT: s_and_b32 s8, s6, 7 +; CI-NEXT: s_cmp_gt_i32 s8, 5 +; CI-NEXT: s_cselect_b32 s9, 1, 0 +; CI-NEXT: s_cmp_eq_u32 s8, 3 +; CI-NEXT: s_cselect_b32 s8, 1, 0 +; CI-NEXT: s_or_b32 s8, s8, s9 +; CI-NEXT: s_lshr_b32 s6, s6, 2 +; CI-NEXT: s_add_i32 s6, s6, s8 +; CI-NEXT: s_cmp_lt_i32 s7, 31 +; CI-NEXT: s_cselect_b32 s6, s6, 0x7c00 +; CI-NEXT: s_cmp_lg_u32 s5, 0 +; CI-NEXT: s_movk_i32 s5, 0x7e00 +; CI-NEXT: s_cselect_b32 s5, s5, 0x7c00 +; CI-NEXT: s_cmpk_eq_i32 s7, 0x40f +; CI-NEXT: s_cselect_b32 s5, s5, s6 +; CI-NEXT: s_lshr_b32 s4, s4, 16 +; CI-NEXT: s_and_b32 s4, s4, 0x8000 +; CI-NEXT: s_or_b32 s4, s4, s5 +; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: fast_frem_f16: @@ -1649,84 +1726,181 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: unsafe_frem_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s0 -; SI-NEXT: s_mov_b32 s9, s1 -; SI-NEXT: s_mov_b32 s0, s2 -; SI-NEXT: s_mov_b32 s1, s3 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_mov_b32 s0, s8 +; SI-NEXT: s_mov_b32 s1, s9 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s11 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 -; SI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 -; SI-NEXT: v_rcp_f32_e32 v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_div_scale_f32 v0, vcc, v2, v4, v2 +; SI-NEXT: v_div_scale_f32 v1, s[4:5], v4, v4, v2 +; SI-NEXT: v_rcp_f32_e32 v3, v1 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v5, -v3, v4, 1.0 -; SI-NEXT: v_fma_f32 v4, v5, v4, v4 -; SI-NEXT: v_mul_f32_e32 v5, v2, v4 -; SI-NEXT: v_fma_f32 v6, -v3, v5, v2 -; SI-NEXT: v_fma_f32 v5, v6, v4, v5 -; SI-NEXT: v_fma_f32 v2, -v3, v5, v2 +; SI-NEXT: v_fma_f32 v5, -v1, v3, 1.0 +; SI-NEXT: v_fma_f32 v3, v5, v3, v3 +; SI-NEXT: v_mul_f32_e32 v5, v0, v3 +; SI-NEXT: v_fma_f32 v6, -v1, v5, v0 +; SI-NEXT: v_fma_f32 v5, v6, v3, v5 +; SI-NEXT: v_fma_f32 v0, -v1, v5, v0 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; SI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 -; SI-NEXT: v_trunc_f32_e32 v2, v2 -; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 +; SI-NEXT: v_div_fmas_f32 v0, v0, v3, v5 +; SI-NEXT: v_div_fixup_f32 v0, v0, v4, v2 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_trunc_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[2:3] +; SI-NEXT: v_readfirstlane_b32 s4, v1 +; SI-NEXT: s_and_b32 s5, s4, 0x1ff +; SI-NEXT: v_or_b32_e32 v0, s5, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: s_lshr_b32 s5, s4, 8 +; SI-NEXT: s_and_b32 s5, s5, 0xffe +; SI-NEXT: v_readfirstlane_b32 s6, v0 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s6, s5, 0x1000 +; SI-NEXT: s_bfe_u32 s7, s4, 0xb0014 +; SI-NEXT: s_sub_i32 s8, 0x3f1, s7 +; SI-NEXT: v_med3_i32 v0, s8, 0, 13 +; SI-NEXT: v_readfirstlane_b32 s8, v0 +; SI-NEXT: s_lshr_b32 s9, s6, s8 +; SI-NEXT: s_lshl_b32 s8, s9, s8 +; SI-NEXT: s_cmp_lg_u32 s8, s6 +; SI-NEXT: s_cselect_b32 s6, 1, 0 +; SI-NEXT: s_or_b32 s6, s9, s6 +; SI-NEXT: s_addk_i32 s7, 0xfc10 +; SI-NEXT: s_lshl_b32 s8, s7, 12 +; SI-NEXT: s_or_b32 s8, s5, s8 +; SI-NEXT: s_cmp_lt_i32 s7, 1 +; SI-NEXT: s_cselect_b32 s6, s6, s8 +; SI-NEXT: s_and_b32 s8, s6, 7 +; SI-NEXT: s_cmp_gt_i32 s8, 5 +; SI-NEXT: s_cselect_b32 s9, 1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 3 +; SI-NEXT: s_cselect_b32 s8, 1, 0 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_lshr_b32 s6, s6, 2 +; SI-NEXT: s_add_i32 s6, s6, s8 +; SI-NEXT: s_cmp_lt_i32 s7, 31 +; SI-NEXT: s_cselect_b32 s6, s6, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s5, 0 +; SI-NEXT: s_movk_i32 s5, 0x7e00 +; SI-NEXT: s_cselect_b32 s5, s5, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s7, 0x40f +; SI-NEXT: s_cselect_b32 s5, s5, s6 +; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: s_and_b32 s4, s4, 0x8000 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: unsafe_frem_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, -1 -; CI-NEXT: s_mov_b32 s6, s10 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_mov_b32 s6, s2 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s8, s0 -; CI-NEXT: s_mov_b32 s9, s1 -; CI-NEXT: s_mov_b32 s0, s2 -; CI-NEXT: s_mov_b32 s1, s3 -; CI-NEXT: s_mov_b32 s2, s10 -; CI-NEXT: s_mov_b32 s3, s11 -; CI-NEXT: s_mov_b32 s7, s11 -; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; CI-NEXT: s_mov_b32 s0, s8 +; CI-NEXT: s_mov_b32 s1, s9 +; CI-NEXT: s_mov_b32 s8, s10 +; CI-NEXT: s_mov_b32 s9, s11 +; CI-NEXT: s_mov_b32 s10, s2 +; CI-NEXT: s_mov_b32 s11, s3 +; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; CI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:8 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 -; CI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 -; CI-NEXT: v_rcp_f32_e32 v4, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; CI-NEXT: v_div_scale_f32 v1, s[4:5], v4, v4, v2 +; CI-NEXT: v_div_scale_f32 v0, vcc, v2, v4, v2 +; CI-NEXT: v_rcp_f32_e32 v3, v1 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v5, -v3, v4, 1.0 -; CI-NEXT: v_fma_f32 v4, v5, v4, v4 -; CI-NEXT: v_mul_f32_e32 v5, v2, v4 -; CI-NEXT: v_fma_f32 v6, -v3, v5, v2 -; CI-NEXT: v_fma_f32 v5, v6, v4, v5 -; CI-NEXT: v_fma_f32 v2, -v3, v5, v2 +; CI-NEXT: v_fma_f32 v5, -v1, v3, 1.0 +; CI-NEXT: v_fma_f32 v3, v5, v3, v3 +; CI-NEXT: v_mul_f32_e32 v5, v0, v3 +; CI-NEXT: v_fma_f32 v6, -v1, v5, v0 +; CI-NEXT: v_fma_f32 v5, v6, v3, v5 +; CI-NEXT: v_fma_f32 v0, -v1, v5, v0 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 -; CI-NEXT: v_trunc_f32_e32 v2, v2 -; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 +; CI-NEXT: v_div_fmas_f32 v0, v0, v3, v5 +; CI-NEXT: v_div_fixup_f32 v0, v0, v4, v2 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_trunc_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; CI-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[2:3] +; CI-NEXT: v_readfirstlane_b32 s4, v1 +; CI-NEXT: s_and_b32 s5, s4, 0x1ff +; CI-NEXT: v_or_b32_e32 v0, s5, v0 +; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; CI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CI-NEXT: s_lshr_b32 s5, s4, 8 +; CI-NEXT: s_bfe_u32 s7, s4, 0xb0014 +; CI-NEXT: s_and_b32 s5, s5, 0xffe +; CI-NEXT: v_readfirstlane_b32 s6, v0 +; CI-NEXT: s_sub_i32 s8, 0x3f1, s7 +; CI-NEXT: s_or_b32 s5, s5, s6 +; CI-NEXT: v_med3_i32 v0, s8, 0, 13 +; CI-NEXT: s_or_b32 s6, s5, 0x1000 +; CI-NEXT: v_readfirstlane_b32 s8, v0 +; CI-NEXT: s_lshr_b32 s9, s6, s8 +; CI-NEXT: s_lshl_b32 s8, s9, s8 +; CI-NEXT: s_cmp_lg_u32 s8, s6 +; CI-NEXT: s_cselect_b32 s6, 1, 0 +; CI-NEXT: s_addk_i32 s7, 0xfc10 +; CI-NEXT: s_lshl_b32 s8, s7, 12 +; CI-NEXT: s_or_b32 s6, s9, s6 +; CI-NEXT: s_or_b32 s8, s5, s8 +; CI-NEXT: s_cmp_lt_i32 s7, 1 +; CI-NEXT: s_cselect_b32 s6, s6, s8 +; CI-NEXT: s_and_b32 s8, s6, 7 +; CI-NEXT: s_cmp_gt_i32 s8, 5 +; CI-NEXT: s_cselect_b32 s9, 1, 0 +; CI-NEXT: s_cmp_eq_u32 s8, 3 +; CI-NEXT: s_cselect_b32 s8, 1, 0 +; CI-NEXT: s_or_b32 s8, s8, s9 +; CI-NEXT: s_lshr_b32 s6, s6, 2 +; CI-NEXT: s_add_i32 s6, s6, s8 +; CI-NEXT: s_cmp_lt_i32 s7, 31 +; CI-NEXT: s_cselect_b32 s6, s6, 0x7c00 +; CI-NEXT: s_cmp_lg_u32 s5, 0 +; CI-NEXT: s_movk_i32 s5, 0x7e00 +; CI-NEXT: s_cselect_b32 s5, s5, 0x7c00 +; CI-NEXT: s_cmpk_eq_i32 s7, 0x40f +; CI-NEXT: s_cselect_b32 s5, s5, s6 +; CI-NEXT: s_lshr_b32 s4, s4, 16 +; CI-NEXT: s_and_b32 s4, s4, 0x8000 +; CI-NEXT: s_or_b32 s4, s4, s5 +; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: unsafe_frem_f16: @@ -4827,67 +5001,58 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_mov_b32 s5, s11 ; SI-NEXT: s_mov_b32 s2, s6 ; SI-NEXT: s_mov_b32 s3, s7 -; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e64 v3, |v0| ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_and_b32_e32 v5, 0x7fffffff, v2 -; SI-NEXT: v_and_b32_e32 v6, 0x7fffffff, v3 -; SI-NEXT: v_cmp_ngt_f32_e64 s[0:1], |v2|, |v3| -; SI-NEXT: s_and_b64 vcc, exec, s[0:1] -; SI-NEXT: v_cvt_f16_f32_e32 v4, v2 +; SI-NEXT: v_cvt_f32_f16_e64 v4, |v1| +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v4 ; SI-NEXT: s_cbranch_vccz .LBB9_2 ; SI-NEXT: ; %bb.1: ; %frem.else20 -; SI-NEXT: v_and_b32_e32 v7, 0x80000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cmp_eq_f32_e32 vcc, v5, v6 -; SI-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; SI-NEXT: v_and_b32_e32 v2, 0xffff8000, v0 +; SI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v4 +; SI-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execz .LBB9_3 ; SI-NEXT: s_branch .LBB9_8 ; SI-NEXT: .LBB9_2: -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: .LBB9_3: ; %frem.compute19 ; SI-NEXT: s_mov_b32 s3, 0x7f800000 -; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s3 -; SI-NEXT: v_frexp_exp_i32_f32_e32 v4, v5 +; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s3 +; SI-NEXT: v_frexp_exp_i32_f32_e32 v2, v3 ; SI-NEXT: s_and_b64 s[0:1], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s0, v4 +; SI-NEXT: v_readfirstlane_b32 s0, v2 ; SI-NEXT: s_cselect_b32 s2, s0, 0 -; SI-NEXT: v_frexp_mant_f32_e32 v4, v5 -; SI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; SI-NEXT: v_ldexp_f32_e64 v5, v4, 11 -; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, s3 -; SI-NEXT: v_frexp_mant_f32_e32 v4, v6 -; SI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; SI-NEXT: v_frexp_exp_i32_f32_e32 v6, v6 +; SI-NEXT: v_frexp_mant_f32_e32 v2, v3 +; SI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; SI-NEXT: v_ldexp_f32_e64 v3, v2, 11 +; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v4|, s3 +; SI-NEXT: v_frexp_mant_f32_e32 v2, v4 +; SI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NEXT: v_frexp_exp_i32_f32_e32 v4, v4 ; SI-NEXT: s_and_b64 s[0:1], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s0, v6 +; SI-NEXT: v_readfirstlane_b32 s0, v4 ; SI-NEXT: s_cselect_b32 s3, s0, 0 ; SI-NEXT: s_add_i32 s0, s3, -1 -; SI-NEXT: v_ldexp_f32_e64 v4, v4, 1 +; SI-NEXT: v_ldexp_f32_e64 v2, v2, 1 ; SI-NEXT: s_not_b32 s1, s0 ; SI-NEXT: s_add_i32 s1, s1, s2 -; SI-NEXT: v_div_scale_f32 v6, vcc, 1.0, v4, 1.0 -; SI-NEXT: v_div_scale_f32 v7, s[4:5], v4, v4, 1.0 -; SI-NEXT: v_rcp_f32_e32 v8, v7 +; SI-NEXT: v_div_scale_f32 v4, vcc, 1.0, v2, 1.0 +; SI-NEXT: v_div_scale_f32 v5, s[4:5], v2, v2, 1.0 +; SI-NEXT: v_rcp_f32_e32 v6, v5 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v9, -v7, v8, 1.0 -; SI-NEXT: v_fma_f32 v8, v9, v8, v8 -; SI-NEXT: v_mul_f32_e32 v9, v6, v8 -; SI-NEXT: v_fma_f32 v10, -v7, v9, v6 -; SI-NEXT: v_fma_f32 v9, v10, v8, v9 -; SI-NEXT: v_fma_f32 v6, -v7, v9, v6 +; SI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 +; SI-NEXT: v_fma_f32 v6, v7, v6, v6 +; SI-NEXT: v_mul_f32_e32 v7, v4, v6 +; SI-NEXT: v_fma_f32 v8, -v5, v7, v4 +; SI-NEXT: v_fma_f32 v7, v8, v6, v7 +; SI-NEXT: v_fma_f32 v4, -v5, v7, v4 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v6, v6, v8, v9 -; SI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 +; SI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 +; SI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; SI-NEXT: s_cmp_lt_i32 s1, 12 ; SI-NEXT: s_cbranch_scc1 .LBB9_7 ; SI-NEXT: ; %bb.4: ; %frem.loop_body27.preheader @@ -4895,45 +5060,44 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_add_i32 s1, s1, 11 ; SI-NEXT: .LBB9_5: ; %frem.loop_body27 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_mov_b32_e32 v7, v5 -; SI-NEXT: v_mul_f32_e32 v5, v7, v6 -; SI-NEXT: v_rndne_f32_e32 v5, v5 -; SI-NEXT: v_fma_f32 v5, -v5, v4, v7 -; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 -; SI-NEXT: v_add_f32_e32 v8, v5, v4 -; SI-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc -; SI-NEXT: v_ldexp_f32_e64 v5, v5, 11 +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_mul_f32_e32 v3, v5, v4 +; SI-NEXT: v_rndne_f32_e32 v3, v3 +; SI-NEXT: v_fma_f32 v3, -v3, v2, v5 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3 +; SI-NEXT: v_add_f32_e32 v6, v3, v2 +; SI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; SI-NEXT: v_ldexp_f32_e64 v3, v3, 11 ; SI-NEXT: s_add_i32 s1, s1, -11 ; SI-NEXT: s_cmp_gt_i32 s1, 11 ; SI-NEXT: s_cbranch_scc1 .LBB9_5 ; SI-NEXT: ; %bb.6: ; %Flow55 -; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: .LBB9_7: ; %frem.loop_exit28 ; SI-NEXT: s_add_i32 s1, s1, -10 -; SI-NEXT: v_ldexp_f32_e64 v5, v5, s1 -; SI-NEXT: v_mul_f32_e32 v6, v5, v6 -; SI-NEXT: v_rndne_f32_e32 v6, v6 -; SI-NEXT: v_fma_f32 v5, -v6, v4, v5 -; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 -; SI-NEXT: v_add_f32_e32 v4, v5, v4 -; SI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; SI-NEXT: v_ldexp_f32_e64 v4, v4, s0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_bfi_b32 v4, s0, v4, v2 -; SI-NEXT: .LBB9_8: -; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v1 -; SI-NEXT: v_cvt_f32_f16_e64 v6, |v5| -; SI-NEXT: v_cvt_f32_f16_e64 v7, |v7| +; SI-NEXT: v_ldexp_f32_e64 v3, v3, s1 +; SI-NEXT: v_mul_f32_e32 v4, v3, v4 +; SI-NEXT: v_rndne_f32_e32 v4, v4 +; SI-NEXT: v_fma_f32 v3, -v4, v2, v3 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3 +; SI-NEXT: v_add_f32_e32 v2, v3, v2 +; SI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; SI-NEXT: v_ldexp_f32_e64 v2, v2, s0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff8000, v0 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: .LBB9_8: ; %Flow58 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e64 v6, |v3| +; SI-NEXT: v_cvt_f32_f16_e64 v7, |v4| ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v6, v7 ; SI-NEXT: s_cbranch_vccz .LBB9_10 ; SI-NEXT: ; %bb.9: ; %frem.else -; SI-NEXT: v_and_b32_e32 v8, 0x80000000, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_and_b32_e32 v5, 0x8000, v3 ; SI-NEXT: v_cmp_eq_f32_e32 vcc, v6, v7 -; SI-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc +; SI-NEXT: v_cndmask_b32_e32 v5, v3, v5, vcc ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execz .LBB9_11 ; SI-NEXT: s_branch .LBB9_16 @@ -5005,38 +5169,29 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; SI-NEXT: v_ldexp_f32_e64 v5, v5, s0 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_bfi_b32 v5, s0, v5, v0 +; SI-NEXT: v_and_b32_e32 v5, 0x7fff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0x8000, v3 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: .LBB9_16: ; %Flow54 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e64 v2, |v2| -; SI-NEXT: s_mov_b32 s2, 0x7f800000 -; SI-NEXT: v_cmp_nle_f32_e64 s[0:1], s2, v2 -; SI-NEXT: s_and_b64 vcc, s[0:1], vcc -; SI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; SI-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; SI-NEXT: s_mov_b32 s2, 0x7f800000 ; SI-NEXT: v_cmp_nle_f32_e64 s[0:1], s2, v0 ; SI-NEXT: s_and_b64 vcc, s[0:1], vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_mov_b32_e32 v0, 0x7e00 +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v2, vcc +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v2 +; SI-NEXT: v_cvt_f32_f16_e64 v2, |v3| +; SI-NEXT: v_cmp_nle_f32_e64 s[0:1], s2, v2 +; SI-NEXT: s_and_b64 vcc, s[0:1], vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v3, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; @@ -5050,101 +5205,91 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s4, s10 ; CI-NEXT: s_mov_b32 s5, s11 -; CI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; CI-NEXT: s_mov_b32 s3, s7 -; CI-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:16 +; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e64 v4, |v0| ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v2 -; CI-NEXT: v_cmp_ngt_f32_e64 s[0:1], |v2|, |v3| -; CI-NEXT: v_and_b32_e32 v6, 0x7fffffff, v2 -; CI-NEXT: v_and_b32_e32 v5, 0x7fffffff, v3 -; CI-NEXT: s_and_b64 vcc, exec, s[0:1] +; CI-NEXT: v_cvt_f32_f16_e64 v3, |v1| +; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v3 ; CI-NEXT: s_cbranch_vccz .LBB9_2 ; CI-NEXT: ; %bb.1: ; %frem.else20 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_and_b32_e32 v7, 0x80000000, v2 -; CI-NEXT: v_cmp_eq_f32_e32 vcc, v6, v5 -; CI-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; CI-NEXT: v_and_b32_e32 v2, 0xffff8000, v0 +; CI-NEXT: v_cmp_eq_f32_e32 vcc, v4, v3 +; CI-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; CI-NEXT: s_cbranch_execz .LBB9_3 ; CI-NEXT: s_branch .LBB9_8 ; CI-NEXT: .LBB9_2: -; CI-NEXT: ; implicit-def: $vgpr4 +; CI-NEXT: ; implicit-def: $vgpr2 ; CI-NEXT: .LBB9_3: ; %frem.compute19 -; CI-NEXT: v_frexp_exp_i32_f32_e32 v9, v6 -; CI-NEXT: v_frexp_mant_f32_e32 v4, v6 -; CI-NEXT: v_frexp_mant_f32_e32 v6, v5 -; CI-NEXT: v_frexp_exp_i32_f32_e32 v10, v5 -; CI-NEXT: v_ldexp_f32_e64 v5, v6, 1 -; CI-NEXT: v_div_scale_f32 v11, s[0:1], v5, v5, 1.0 -; CI-NEXT: v_ldexp_f32_e64 v7, v4, 11 -; CI-NEXT: v_add_i32_e32 v4, vcc, -1, v10 -; CI-NEXT: v_not_b32_e32 v6, v4 -; CI-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; CI-NEXT: v_div_scale_f32 v8, vcc, 1.0, v5, 1.0 -; CI-NEXT: v_rcp_f32_e32 v12, v11 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v7, v4 +; CI-NEXT: v_frexp_mant_f32_e32 v2, v4 +; CI-NEXT: v_frexp_mant_f32_e32 v4, v3 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v8, v3 +; CI-NEXT: v_ldexp_f32_e64 v3, v4, 1 +; CI-NEXT: v_div_scale_f32 v9, s[0:1], v3, v3, 1.0 +; CI-NEXT: v_ldexp_f32_e64 v5, v2, 11 +; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v8 +; CI-NEXT: v_not_b32_e32 v4, v2 +; CI-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CI-NEXT: v_div_scale_f32 v6, vcc, 1.0, v3, 1.0 +; CI-NEXT: v_rcp_f32_e32 v10, v9 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v13, -v11, v12, 1.0 -; CI-NEXT: v_fma_f32 v12, v13, v12, v12 -; CI-NEXT: v_mul_f32_e32 v13, v8, v12 -; CI-NEXT: v_fma_f32 v14, -v11, v13, v8 -; CI-NEXT: v_fma_f32 v13, v14, v12, v13 -; CI-NEXT: v_fma_f32 v8, -v11, v13, v8 +; CI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 +; CI-NEXT: v_fma_f32 v10, v11, v10, v10 +; CI-NEXT: v_mul_f32_e32 v11, v6, v10 +; CI-NEXT: v_fma_f32 v12, -v9, v11, v6 +; CI-NEXT: v_fma_f32 v11, v12, v10, v11 +; CI-NEXT: v_fma_f32 v6, -v9, v11, v6 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v8, v8, v12, v13 -; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v6 -; CI-NEXT: v_div_fixup_f32 v8, v8, v5, 1.0 +; CI-NEXT: v_div_fmas_f32 v6, v6, v10, v11 +; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v4 +; CI-NEXT: v_div_fixup_f32 v6, v6, v3, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB9_7 ; CI-NEXT: ; %bb.4: ; %frem.loop_body27.preheader -; CI-NEXT: v_sub_i32_e32 v6, vcc, v9, v10 -; CI-NEXT: v_add_i32_e32 v6, vcc, 11, v6 +; CI-NEXT: v_sub_i32_e32 v4, vcc, v7, v8 +; CI-NEXT: v_add_i32_e32 v4, vcc, 11, v4 ; CI-NEXT: .LBB9_5: ; %frem.loop_body27 ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 -; CI-NEXT: v_mov_b32_e32 v9, v7 -; CI-NEXT: v_mul_f32_e32 v7, v9, v8 -; CI-NEXT: v_rndne_f32_e32 v7, v7 -; CI-NEXT: v_fma_f32 v7, -v7, v5, v9 -; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v7 -; CI-NEXT: v_add_f32_e32 v10, v7, v5 -; CI-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc -; CI-NEXT: v_add_i32_e32 v6, vcc, -11, v6 -; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v6 -; CI-NEXT: v_ldexp_f32_e64 v7, v7, 11 +; CI-NEXT: v_mov_b32_e32 v7, v5 +; CI-NEXT: v_mul_f32_e32 v5, v7, v6 +; CI-NEXT: v_rndne_f32_e32 v5, v5 +; CI-NEXT: v_fma_f32 v5, -v5, v3, v7 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; CI-NEXT: v_add_f32_e32 v8, v5, v3 +; CI-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc +; CI-NEXT: v_add_i32_e32 v4, vcc, -11, v4 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v4 +; CI-NEXT: v_ldexp_f32_e64 v5, v5, 11 ; CI-NEXT: s_cbranch_vccnz .LBB9_5 ; CI-NEXT: ; %bb.6: ; %Flow55 -; CI-NEXT: v_mov_b32_e32 v7, v9 +; CI-NEXT: v_mov_b32_e32 v5, v7 ; CI-NEXT: .LBB9_7: ; %frem.loop_exit28 -; CI-NEXT: v_add_i32_e32 v6, vcc, -10, v6 -; CI-NEXT: v_ldexp_f32_e32 v6, v7, v6 -; CI-NEXT: v_mul_f32_e32 v7, v6, v8 -; CI-NEXT: v_rndne_f32_e32 v7, v7 -; CI-NEXT: v_fma_f32 v6, -v7, v5, v6 -; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v6 -; CI-NEXT: v_add_f32_e32 v5, v6, v5 -; CI-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; CI-NEXT: v_add_i32_e32 v4, vcc, -10, v4 ; CI-NEXT: v_ldexp_f32_e32 v4, v5, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: s_brev_b32 s0, -2 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_bfi_b32 v4, s0, v4, v2 -; CI-NEXT: .LBB9_8: -; CI-NEXT: v_cvt_f16_f32_e32 v5, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v1 -; CI-NEXT: v_cvt_f32_f16_e64 v7, |v5| -; CI-NEXT: v_cvt_f32_f16_e64 v6, |v6| +; CI-NEXT: v_mul_f32_e32 v5, v4, v6 +; CI-NEXT: v_rndne_f32_e32 v5, v5 +; CI-NEXT: v_fma_f32 v4, -v5, v3, v4 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 +; CI-NEXT: v_add_f32_e32 v3, v4, v3 +; CI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; CI-NEXT: v_ldexp_f32_e32 v2, v3, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_and_b32_e32 v3, 0xffff8000, v0 +; CI-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; CI-NEXT: v_or_b32_e32 v2, v2, v3 +; CI-NEXT: .LBB9_8: ; %Flow58 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e64 v7, |v3| +; CI-NEXT: v_cvt_f32_f16_e64 v6, |v4| ; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v7, v6 ; CI-NEXT: s_cbranch_vccz .LBB9_10 ; CI-NEXT: ; %bb.9: ; %frem.else -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_and_b32_e32 v8, 0x80000000, v0 +; CI-NEXT: v_and_b32_e32 v5, 0x8000, v3 ; CI-NEXT: v_cmp_eq_f32_e32 vcc, v7, v6 -; CI-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc +; CI-NEXT: v_cndmask_b32_e32 v5, v3, v5, vcc ; CI-NEXT: s_cbranch_execz .LBB9_11 ; CI-NEXT: s_branch .LBB9_16 ; CI-NEXT: .LBB9_10: @@ -5203,38 +5348,29 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc ; CI-NEXT: v_ldexp_f32_e32 v5, v6, v5 ; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: s_brev_b32 s0, -2 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_bfi_b32 v5, s0, v5, v0 +; CI-NEXT: v_and_b32_e32 v6, 0x8000, v3 +; CI-NEXT: v_and_b32_e32 v5, 0x7fff, v5 +; CI-NEXT: v_or_b32_e32 v5, v5, v6 ; CI-NEXT: .LBB9_16: ; %Flow54 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: s_mov_b32 s2, 0x7f800000 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v4 -; CI-NEXT: v_cmp_nle_f32_e64 s[0:1], s2, v2 -; CI-NEXT: s_and_b64 vcc, s[0:1], vcc -; CI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; CI-NEXT: s_mov_b32 s2, 0x7f800000 ; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, -1 -; CI-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc ; CI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v5 ; CI-NEXT: v_cmp_nle_f32_e64 s[0:1], s2, v0 ; CI-NEXT: s_and_b64 vcc, s[0:1], vcc -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_mov_b32_e32 v0, 0x7e00 +; CI-NEXT: v_cndmask_b32_e32 v1, v0, v2, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; CI-NEXT: s_mov_b32 s10, -1 +; CI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v2 +; CI-NEXT: v_cvt_f32_f16_e64 v2, |v3| +; CI-NEXT: v_cmp_nle_f32_e64 s[0:1], s2, v2 +; CI-NEXT: s_and_b64 vcc, s[0:1], vcc +; CI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_or_b32_e32 v0, v3, v0 +; CI-NEXT: v_or_b32_e32 v0, v1, v0 ; CI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; CI-NEXT: s_endpgm ; @@ -7139,800 +7275,754 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-LABEL: frem_v4f16: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s10 -; SI-NEXT: s_mov_b32 s5, s11 -; SI-NEXT: s_mov_b32 s2, s6 -; SI-NEXT: s_mov_b32 s3, s7 -; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s0, s10 +; SI-NEXT: s_mov_b32 s1, s11 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 -; SI-NEXT: buffer_load_dwordx2 v[7:8], off, s[0:3], 0 offset:32 +; SI-NEXT: v_readfirstlane_b32 s2, v1 +; SI-NEXT: v_readfirstlane_b32 s3, v0 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 +; SI-NEXT: v_cvt_f32_f16_e64 v3, |s3| ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_and_b32_e32 v9, 0x7fffffff, v6 -; SI-NEXT: v_and_b32_e32 v10, 0x7fffffff, v7 -; SI-NEXT: v_cmp_ngt_f32_e64 s[0:1], |v6|, |v7| -; SI-NEXT: s_and_b64 vcc, exec, s[0:1] -; SI-NEXT: v_cvt_f16_f32_e32 v8, v6 -; SI-NEXT: s_cbranch_vccz .LBB10_2 +; SI-NEXT: v_cvt_f32_f16_e64 v2, |v0| +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2 +; SI-NEXT: s_cbranch_vccz .LBB10_3 ; SI-NEXT: ; %bb.1: ; %frem.else86 -; SI-NEXT: v_and_b32_e32 v11, 0x80000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cmp_eq_f32_e32 vcc, v9, v10 -; SI-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc +; SI-NEXT: s_and_b32 s4, s3, 0xffff8000 +; SI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2 +; SI-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-NEXT: s_cselect_b32 s4, s4, s3 ; SI-NEXT: s_mov_b64 vcc, exec -; SI-NEXT: s_cbranch_execz .LBB10_3 -; SI-NEXT: s_branch .LBB10_8 -; SI-NEXT: .LBB10_2: -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_cbranch_execz .LBB10_4 +; SI-NEXT: ; %bb.2: +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: s_branch .LBB10_9 +; SI-NEXT: .LBB10_3: +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB10_3: ; %frem.compute85 -; SI-NEXT: s_mov_b32 s3, 0x7f800000 -; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v9|, s3 -; SI-NEXT: v_frexp_exp_i32_f32_e32 v8, v9 +; SI-NEXT: .LBB10_4: ; %frem.compute85 +; SI-NEXT: s_mov_b32 s5, 0x7f800000 +; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s5 +; SI-NEXT: v_frexp_exp_i32_f32_e32 v4, v3 ; SI-NEXT: s_and_b64 s[0:1], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s0, v8 -; SI-NEXT: s_cselect_b32 s2, s0, 0 -; SI-NEXT: v_frexp_mant_f32_e32 v8, v9 -; SI-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc -; SI-NEXT: v_ldexp_f32_e64 v9, v8, 11 -; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v10|, s3 -; SI-NEXT: v_frexp_mant_f32_e32 v8, v10 -; SI-NEXT: v_cndmask_b32_e32 v8, v10, v8, vcc -; SI-NEXT: v_frexp_exp_i32_f32_e32 v10, v10 +; SI-NEXT: v_readfirstlane_b32 s0, v4 +; SI-NEXT: s_cselect_b32 s4, s0, 0 +; SI-NEXT: v_frexp_mant_f32_e32 v4, v3 +; SI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; SI-NEXT: v_ldexp_f32_e64 v3, v3, 11 +; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v2|, s5 +; SI-NEXT: v_frexp_mant_f32_e32 v4, v2 +; SI-NEXT: v_cndmask_b32_e32 v4, v2, v4, vcc +; SI-NEXT: v_frexp_exp_i32_f32_e32 v2, v2 ; SI-NEXT: s_and_b64 s[0:1], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s0, v10 -; SI-NEXT: s_cselect_b32 s3, s0, 0 -; SI-NEXT: s_add_i32 s0, s3, -1 -; SI-NEXT: v_ldexp_f32_e64 v8, v8, 1 +; SI-NEXT: v_readfirstlane_b32 s0, v2 +; SI-NEXT: s_cselect_b32 s5, s0, 0 +; SI-NEXT: s_add_i32 s0, s5, -1 +; SI-NEXT: v_ldexp_f32_e64 v2, v4, 1 ; SI-NEXT: s_not_b32 s1, s0 -; SI-NEXT: s_add_i32 s1, s1, s2 -; SI-NEXT: v_div_scale_f32 v10, vcc, 1.0, v8, 1.0 -; SI-NEXT: v_div_scale_f32 v11, s[4:5], v8, v8, 1.0 -; SI-NEXT: v_rcp_f32_e32 v12, v11 +; SI-NEXT: s_add_i32 s1, s1, s4 +; SI-NEXT: v_div_scale_f32 v4, vcc, 1.0, v2, 1.0 +; SI-NEXT: v_div_scale_f32 v5, s[6:7], v2, v2, 1.0 +; SI-NEXT: v_rcp_f32_e32 v6, v5 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v13, -v11, v12, 1.0 -; SI-NEXT: v_fma_f32 v12, v13, v12, v12 -; SI-NEXT: v_mul_f32_e32 v13, v10, v12 -; SI-NEXT: v_fma_f32 v14, -v11, v13, v10 -; SI-NEXT: v_fma_f32 v13, v14, v12, v13 -; SI-NEXT: v_fma_f32 v10, -v11, v13, v10 +; SI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 +; SI-NEXT: v_fma_f32 v6, v7, v6, v6 +; SI-NEXT: v_mul_f32_e32 v7, v4, v6 +; SI-NEXT: v_fma_f32 v8, -v5, v7, v4 +; SI-NEXT: v_fma_f32 v7, v8, v6, v7 +; SI-NEXT: v_fma_f32 v4, -v5, v7, v4 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v10, v10, v12, v13 -; SI-NEXT: v_div_fixup_f32 v10, v10, v8, 1.0 +; SI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 +; SI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; SI-NEXT: s_cmp_lt_i32 s1, 12 -; SI-NEXT: s_cbranch_scc1 .LBB10_7 -; SI-NEXT: ; %bb.4: ; %frem.loop_body93.preheader -; SI-NEXT: s_sub_i32 s1, s2, s3 +; SI-NEXT: s_cbranch_scc1 .LBB10_8 +; SI-NEXT: ; %bb.5: ; %frem.loop_body93.preheader +; SI-NEXT: s_sub_i32 s1, s4, s5 ; SI-NEXT: s_add_i32 s1, s1, 11 -; SI-NEXT: .LBB10_5: ; %frem.loop_body93 +; SI-NEXT: .LBB10_6: ; %frem.loop_body93 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_mov_b32_e32 v11, v9 -; SI-NEXT: v_mul_f32_e32 v9, v11, v10 -; SI-NEXT: v_rndne_f32_e32 v9, v9 -; SI-NEXT: v_fma_f32 v9, -v9, v8, v11 -; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v9 -; SI-NEXT: v_add_f32_e32 v12, v9, v8 -; SI-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc -; SI-NEXT: v_ldexp_f32_e64 v9, v9, 11 +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_mul_f32_e32 v3, v5, v4 +; SI-NEXT: v_rndne_f32_e32 v3, v3 +; SI-NEXT: v_fma_f32 v3, -v3, v2, v5 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3 +; SI-NEXT: v_add_f32_e32 v6, v3, v2 +; SI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; SI-NEXT: v_ldexp_f32_e64 v3, v3, 11 ; SI-NEXT: s_add_i32 s1, s1, -11 ; SI-NEXT: s_cmp_gt_i32 s1, 11 -; SI-NEXT: s_cbranch_scc1 .LBB10_5 -; SI-NEXT: ; %bb.6: ; %Flow133 -; SI-NEXT: v_mov_b32_e32 v9, v11 -; SI-NEXT: .LBB10_7: ; %frem.loop_exit94 +; SI-NEXT: s_cbranch_scc1 .LBB10_6 +; SI-NEXT: ; %bb.7: ; %Flow133 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: .LBB10_8: ; %frem.loop_exit94 ; SI-NEXT: s_add_i32 s1, s1, -10 -; SI-NEXT: v_ldexp_f32_e64 v9, v9, s1 -; SI-NEXT: v_mul_f32_e32 v10, v9, v10 -; SI-NEXT: v_rndne_f32_e32 v10, v10 -; SI-NEXT: v_fma_f32 v9, -v10, v8, v9 -; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v9 -; SI-NEXT: v_add_f32_e32 v8, v9, v8 -; SI-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc -; SI-NEXT: v_ldexp_f32_e64 v8, v8, s0 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_bfi_b32 v8, s0, v8, v6 -; SI-NEXT: .LBB10_8: -; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v5 -; SI-NEXT: v_cvt_f32_f16_e64 v10, |v9| -; SI-NEXT: v_cvt_f32_f16_e64 v11, |v11| -; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v10, v11 -; SI-NEXT: s_cbranch_vccz .LBB10_10 -; SI-NEXT: ; %bb.9: ; %frem.else53 -; SI-NEXT: v_and_b32_e32 v12, 0x80000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cmp_eq_f32_e32 vcc, v10, v11 -; SI-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc +; SI-NEXT: v_ldexp_f32_e64 v3, v3, s1 +; SI-NEXT: v_mul_f32_e32 v4, v3, v4 +; SI-NEXT: v_rndne_f32_e32 v4, v4 +; SI-NEXT: v_fma_f32 v3, -v4, v2, v3 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3 +; SI-NEXT: v_add_f32_e32 v2, v3, v2 +; SI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; SI-NEXT: v_ldexp_f32_e64 v2, v2, s0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; SI-NEXT: s_and_b32 s0, s3, 0xffff8000 +; SI-NEXT: v_or_b32_e32 v3, s0, v2 +; SI-NEXT: .LBB10_9: ; %Flow136 +; SI-NEXT: s_lshr_b32 s4, s2, 16 +; SI-NEXT: s_lshr_b32 s5, s3, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: ; %bb.10: +; SI-NEXT: v_cvt_f32_f16_e64 v6, |s5| +; SI-NEXT: v_cvt_f32_f16_e64 v5, |v4| +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v6, v5 +; SI-NEXT: s_cbranch_vccz .LBB10_13 +; SI-NEXT: ; %bb.11: ; %frem.else53 +; SI-NEXT: s_and_b32 s6, s5, 0x8000 +; SI-NEXT: v_cmp_eq_f32_e32 vcc, v6, v5 +; SI-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-NEXT: s_cselect_b32 s6, s6, s5 ; SI-NEXT: s_mov_b64 vcc, exec -; SI-NEXT: s_cbranch_execz .LBB10_11 -; SI-NEXT: s_branch .LBB10_16 -; SI-NEXT: .LBB10_10: -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_cbranch_execz .LBB10_14 +; SI-NEXT: ; %bb.12: +; SI-NEXT: v_mov_b32_e32 v5, s6 +; SI-NEXT: s_branch .LBB10_19 +; SI-NEXT: .LBB10_13: +; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB10_11: ; %frem.compute52 -; SI-NEXT: s_mov_b32 s3, 0x7f800000 -; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v10|, s3 -; SI-NEXT: v_frexp_exp_i32_f32_e32 v9, v10 +; SI-NEXT: .LBB10_14: ; %frem.compute52 +; SI-NEXT: s_mov_b32 s7, 0x7f800000 +; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, s7 +; SI-NEXT: v_frexp_exp_i32_f32_e32 v7, v6 ; SI-NEXT: s_and_b64 s[0:1], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s0, v9 -; SI-NEXT: s_cselect_b32 s2, s0, 0 -; SI-NEXT: v_frexp_mant_f32_e32 v9, v10 -; SI-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc -; SI-NEXT: v_ldexp_f32_e64 v10, v9, 11 -; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v11|, s3 -; SI-NEXT: v_frexp_mant_f32_e32 v9, v11 -; SI-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc -; SI-NEXT: v_frexp_exp_i32_f32_e32 v11, v11 +; SI-NEXT: v_readfirstlane_b32 s0, v7 +; SI-NEXT: s_cselect_b32 s6, s0, 0 +; SI-NEXT: v_frexp_mant_f32_e32 v7, v6 +; SI-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; SI-NEXT: v_ldexp_f32_e64 v6, v6, 11 +; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s7 +; SI-NEXT: v_frexp_mant_f32_e32 v7, v5 +; SI-NEXT: v_cndmask_b32_e32 v7, v5, v7, vcc +; SI-NEXT: v_frexp_exp_i32_f32_e32 v5, v5 ; SI-NEXT: s_and_b64 s[0:1], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s0, v11 -; SI-NEXT: s_cselect_b32 s3, s0, 0 -; SI-NEXT: s_add_i32 s0, s3, -1 -; SI-NEXT: v_ldexp_f32_e64 v9, v9, 1 +; SI-NEXT: v_readfirstlane_b32 s0, v5 +; SI-NEXT: s_cselect_b32 s7, s0, 0 +; SI-NEXT: s_add_i32 s0, s7, -1 +; SI-NEXT: v_ldexp_f32_e64 v5, v7, 1 ; SI-NEXT: s_not_b32 s1, s0 -; SI-NEXT: s_add_i32 s1, s1, s2 -; SI-NEXT: v_div_scale_f32 v11, vcc, 1.0, v9, 1.0 -; SI-NEXT: v_div_scale_f32 v12, s[4:5], v9, v9, 1.0 -; SI-NEXT: v_rcp_f32_e32 v13, v12 +; SI-NEXT: s_add_i32 s1, s1, s6 +; SI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v5, 1.0 +; SI-NEXT: v_div_scale_f32 v8, s[10:11], v5, v5, 1.0 +; SI-NEXT: v_rcp_f32_e32 v9, v8 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v14, -v12, v13, 1.0 -; SI-NEXT: v_fma_f32 v13, v14, v13, v13 -; SI-NEXT: v_mul_f32_e32 v14, v11, v13 -; SI-NEXT: v_fma_f32 v15, -v12, v14, v11 -; SI-NEXT: v_fma_f32 v14, v15, v13, v14 -; SI-NEXT: v_fma_f32 v11, -v12, v14, v11 +; SI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 +; SI-NEXT: v_fma_f32 v9, v10, v9, v9 +; SI-NEXT: v_mul_f32_e32 v10, v7, v9 +; SI-NEXT: v_fma_f32 v11, -v8, v10, v7 +; SI-NEXT: v_fma_f32 v10, v11, v9, v10 +; SI-NEXT: v_fma_f32 v7, -v8, v10, v7 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v11, v11, v13, v14 -; SI-NEXT: v_div_fixup_f32 v11, v11, v9, 1.0 +; SI-NEXT: v_div_fmas_f32 v7, v7, v9, v10 +; SI-NEXT: v_div_fixup_f32 v7, v7, v5, 1.0 ; SI-NEXT: s_cmp_lt_i32 s1, 12 -; SI-NEXT: s_cbranch_scc1 .LBB10_15 -; SI-NEXT: ; %bb.12: ; %frem.loop_body60.preheader -; SI-NEXT: s_sub_i32 s1, s2, s3 +; SI-NEXT: s_cbranch_scc1 .LBB10_18 +; SI-NEXT: ; %bb.15: ; %frem.loop_body60.preheader +; SI-NEXT: s_sub_i32 s1, s6, s7 ; SI-NEXT: s_add_i32 s1, s1, 11 -; SI-NEXT: .LBB10_13: ; %frem.loop_body60 +; SI-NEXT: .LBB10_16: ; %frem.loop_body60 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_mov_b32_e32 v12, v10 -; SI-NEXT: v_mul_f32_e32 v10, v12, v11 -; SI-NEXT: v_rndne_f32_e32 v10, v10 -; SI-NEXT: v_fma_f32 v10, -v10, v9, v12 -; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v10 -; SI-NEXT: v_add_f32_e32 v13, v10, v9 -; SI-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc -; SI-NEXT: v_ldexp_f32_e64 v10, v10, 11 -; SI-NEXT: s_add_i32 s1, s1, -11 -; SI-NEXT: s_cmp_gt_i32 s1, 11 -; SI-NEXT: s_cbranch_scc1 .LBB10_13 -; SI-NEXT: ; %bb.14: ; %Flow129 -; SI-NEXT: v_mov_b32_e32 v10, v12 -; SI-NEXT: .LBB10_15: ; %frem.loop_exit61 -; SI-NEXT: s_add_i32 s1, s1, -10 -; SI-NEXT: v_ldexp_f32_e64 v10, v10, s1 -; SI-NEXT: v_mul_f32_e32 v11, v10, v11 -; SI-NEXT: v_rndne_f32_e32 v11, v11 -; SI-NEXT: v_fma_f32 v10, -v11, v9, v10 -; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v10 -; SI-NEXT: v_add_f32_e32 v9, v10, v9 -; SI-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc -; SI-NEXT: v_ldexp_f32_e64 v9, v9, s0 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_bfi_b32 v9, s0, v9, v4 -; SI-NEXT: .LBB10_16: -; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 -; SI-NEXT: v_cvt_f32_f16_e64 v11, |v10| -; SI-NEXT: v_cvt_f32_f16_e64 v12, |v12| -; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v11, v12 -; SI-NEXT: s_cbranch_vccz .LBB10_18 -; SI-NEXT: ; %bb.17: ; %frem.else20 -; SI-NEXT: v_and_b32_e32 v13, 0x80000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cmp_eq_f32_e32 vcc, v11, v12 -; SI-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc +; SI-NEXT: v_mov_b32_e32 v8, v6 +; SI-NEXT: v_mul_f32_e32 v6, v8, v7 +; SI-NEXT: v_rndne_f32_e32 v6, v6 +; SI-NEXT: v_fma_f32 v6, -v6, v5, v8 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v6 +; SI-NEXT: v_add_f32_e32 v9, v6, v5 +; SI-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc +; SI-NEXT: v_ldexp_f32_e64 v6, v6, 11 +; SI-NEXT: s_add_i32 s1, s1, -11 +; SI-NEXT: s_cmp_gt_i32 s1, 11 +; SI-NEXT: s_cbranch_scc1 .LBB10_16 +; SI-NEXT: ; %bb.17: ; %Flow129 +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: .LBB10_18: ; %frem.loop_exit61 +; SI-NEXT: s_add_i32 s1, s1, -10 +; SI-NEXT: v_ldexp_f32_e64 v6, v6, s1 +; SI-NEXT: v_mul_f32_e32 v7, v6, v7 +; SI-NEXT: v_rndne_f32_e32 v7, v7 +; SI-NEXT: v_fma_f32 v6, -v7, v5, v6 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v6 +; SI-NEXT: v_add_f32_e32 v5, v6, v5 +; SI-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; SI-NEXT: v_ldexp_f32_e64 v5, v5, s0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_and_b32_e32 v5, 0x7fff, v5 +; SI-NEXT: s_and_b32 s0, s5, 0x8000 +; SI-NEXT: v_or_b32_e32 v5, s0, v5 +; SI-NEXT: .LBB10_19: +; SI-NEXT: v_cvt_f32_f16_e64 v7, |s2| +; SI-NEXT: v_cvt_f32_f16_e64 v6, |v1| +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v7, v6 +; SI-NEXT: s_cbranch_vccz .LBB10_22 +; SI-NEXT: ; %bb.20: ; %frem.else20 +; SI-NEXT: s_and_b32 s6, s2, 0xffff8000 +; SI-NEXT: v_cmp_eq_f32_e32 vcc, v7, v6 +; SI-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-NEXT: s_cselect_b32 s6, s6, s2 ; SI-NEXT: s_mov_b64 vcc, exec -; SI-NEXT: s_cbranch_execz .LBB10_19 -; SI-NEXT: s_branch .LBB10_24 -; SI-NEXT: .LBB10_18: -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: s_cbranch_execz .LBB10_23 +; SI-NEXT: ; %bb.21: +; SI-NEXT: v_mov_b32_e32 v6, s6 +; SI-NEXT: s_branch .LBB10_28 +; SI-NEXT: .LBB10_22: +; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB10_19: ; %frem.compute19 -; SI-NEXT: s_mov_b32 s3, 0x7f800000 -; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v11|, s3 -; SI-NEXT: v_frexp_exp_i32_f32_e32 v10, v11 +; SI-NEXT: .LBB10_23: ; %frem.compute19 +; SI-NEXT: s_mov_b32 s7, 0x7f800000 +; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v7|, s7 +; SI-NEXT: v_frexp_exp_i32_f32_e32 v8, v7 ; SI-NEXT: s_and_b64 s[0:1], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s0, v10 -; SI-NEXT: s_cselect_b32 s2, s0, 0 -; SI-NEXT: v_frexp_mant_f32_e32 v10, v11 -; SI-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc -; SI-NEXT: v_ldexp_f32_e64 v11, v10, 11 -; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v12|, s3 -; SI-NEXT: v_frexp_mant_f32_e32 v10, v12 -; SI-NEXT: v_cndmask_b32_e32 v10, v12, v10, vcc -; SI-NEXT: v_frexp_exp_i32_f32_e32 v12, v12 +; SI-NEXT: v_readfirstlane_b32 s0, v8 +; SI-NEXT: s_cselect_b32 s6, s0, 0 +; SI-NEXT: v_frexp_mant_f32_e32 v8, v7 +; SI-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc +; SI-NEXT: v_ldexp_f32_e64 v7, v7, 11 +; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, s7 +; SI-NEXT: v_frexp_mant_f32_e32 v8, v6 +; SI-NEXT: v_cndmask_b32_e32 v8, v6, v8, vcc +; SI-NEXT: v_frexp_exp_i32_f32_e32 v6, v6 ; SI-NEXT: s_and_b64 s[0:1], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s0, v12 -; SI-NEXT: s_cselect_b32 s3, s0, 0 -; SI-NEXT: s_add_i32 s0, s3, -1 -; SI-NEXT: v_ldexp_f32_e64 v10, v10, 1 +; SI-NEXT: v_readfirstlane_b32 s0, v6 +; SI-NEXT: s_cselect_b32 s7, s0, 0 +; SI-NEXT: s_add_i32 s0, s7, -1 +; SI-NEXT: v_ldexp_f32_e64 v6, v8, 1 ; SI-NEXT: s_not_b32 s1, s0 -; SI-NEXT: s_add_i32 s1, s1, s2 -; SI-NEXT: v_div_scale_f32 v12, vcc, 1.0, v10, 1.0 -; SI-NEXT: v_div_scale_f32 v13, s[4:5], v10, v10, 1.0 -; SI-NEXT: v_rcp_f32_e32 v14, v13 +; SI-NEXT: s_add_i32 s1, s1, s6 +; SI-NEXT: v_div_scale_f32 v8, vcc, 1.0, v6, 1.0 +; SI-NEXT: v_div_scale_f32 v9, s[10:11], v6, v6, 1.0 +; SI-NEXT: v_rcp_f32_e32 v10, v9 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v15, -v13, v14, 1.0 -; SI-NEXT: v_fma_f32 v14, v15, v14, v14 -; SI-NEXT: v_mul_f32_e32 v15, v12, v14 -; SI-NEXT: v_fma_f32 v16, -v13, v15, v12 -; SI-NEXT: v_fma_f32 v15, v16, v14, v15 -; SI-NEXT: v_fma_f32 v12, -v13, v15, v12 +; SI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 +; SI-NEXT: v_fma_f32 v10, v11, v10, v10 +; SI-NEXT: v_mul_f32_e32 v11, v8, v10 +; SI-NEXT: v_fma_f32 v12, -v9, v11, v8 +; SI-NEXT: v_fma_f32 v11, v12, v10, v11 +; SI-NEXT: v_fma_f32 v8, -v9, v11, v8 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v12, v12, v14, v15 -; SI-NEXT: v_div_fixup_f32 v12, v12, v10, 1.0 +; SI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 +; SI-NEXT: v_div_fixup_f32 v8, v8, v6, 1.0 ; SI-NEXT: s_cmp_lt_i32 s1, 12 -; SI-NEXT: s_cbranch_scc1 .LBB10_23 -; SI-NEXT: ; %bb.20: ; %frem.loop_body27.preheader -; SI-NEXT: s_sub_i32 s1, s2, s3 +; SI-NEXT: s_cbranch_scc1 .LBB10_27 +; SI-NEXT: ; %bb.24: ; %frem.loop_body27.preheader +; SI-NEXT: s_sub_i32 s1, s6, s7 ; SI-NEXT: s_add_i32 s1, s1, 11 -; SI-NEXT: .LBB10_21: ; %frem.loop_body27 +; SI-NEXT: .LBB10_25: ; %frem.loop_body27 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_mov_b32_e32 v13, v11 -; SI-NEXT: v_mul_f32_e32 v11, v13, v12 -; SI-NEXT: v_rndne_f32_e32 v11, v11 -; SI-NEXT: v_fma_f32 v11, -v11, v10, v13 -; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v11 -; SI-NEXT: v_add_f32_e32 v14, v11, v10 -; SI-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc -; SI-NEXT: v_ldexp_f32_e64 v11, v11, 11 +; SI-NEXT: v_mov_b32_e32 v9, v7 +; SI-NEXT: v_mul_f32_e32 v7, v9, v8 +; SI-NEXT: v_rndne_f32_e32 v7, v7 +; SI-NEXT: v_fma_f32 v7, -v7, v6, v9 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v7 +; SI-NEXT: v_add_f32_e32 v10, v7, v6 +; SI-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc +; SI-NEXT: v_ldexp_f32_e64 v7, v7, 11 ; SI-NEXT: s_add_i32 s1, s1, -11 ; SI-NEXT: s_cmp_gt_i32 s1, 11 -; SI-NEXT: s_cbranch_scc1 .LBB10_21 -; SI-NEXT: ; %bb.22: ; %Flow125 -; SI-NEXT: v_mov_b32_e32 v11, v13 -; SI-NEXT: .LBB10_23: ; %frem.loop_exit28 +; SI-NEXT: s_cbranch_scc1 .LBB10_25 +; SI-NEXT: ; %bb.26: ; %Flow125 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: .LBB10_27: ; %frem.loop_exit28 ; SI-NEXT: s_add_i32 s1, s1, -10 -; SI-NEXT: v_ldexp_f32_e64 v11, v11, s1 -; SI-NEXT: v_mul_f32_e32 v12, v11, v12 -; SI-NEXT: v_rndne_f32_e32 v12, v12 -; SI-NEXT: v_fma_f32 v11, -v12, v10, v11 -; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v11 -; SI-NEXT: v_add_f32_e32 v10, v11, v10 -; SI-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc -; SI-NEXT: v_ldexp_f32_e64 v10, v10, s0 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_bfi_b32 v10, s0, v10, v2 -; SI-NEXT: .LBB10_24: -; SI-NEXT: v_cvt_f16_f32_e32 v11, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v1 -; SI-NEXT: v_cvt_f32_f16_e64 v12, |v11| -; SI-NEXT: v_cvt_f32_f16_e64 v13, |v13| -; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v12, v13 -; SI-NEXT: s_cbranch_vccz .LBB10_26 -; SI-NEXT: ; %bb.25: ; %frem.else -; SI-NEXT: v_and_b32_e32 v14, 0x80000000, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cmp_eq_f32_e32 vcc, v12, v13 -; SI-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc +; SI-NEXT: v_ldexp_f32_e64 v7, v7, s1 +; SI-NEXT: v_mul_f32_e32 v8, v7, v8 +; SI-NEXT: v_rndne_f32_e32 v8, v8 +; SI-NEXT: v_fma_f32 v7, -v8, v6, v7 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v7 +; SI-NEXT: v_add_f32_e32 v6, v7, v6 +; SI-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc +; SI-NEXT: v_ldexp_f32_e64 v6, v6, s0 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_and_b32_e32 v6, 0x7fff, v6 +; SI-NEXT: s_and_b32 s0, s2, 0xffff8000 +; SI-NEXT: v_or_b32_e32 v6, s0, v6 +; SI-NEXT: .LBB10_28: +; SI-NEXT: v_cvt_f32_f16_e64 v8, |s4| +; SI-NEXT: v_cvt_f32_f16_e64 v7, |v2| +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v8, v7 +; SI-NEXT: s_cbranch_vccz .LBB10_31 +; SI-NEXT: ; %bb.29: ; %frem.else +; SI-NEXT: s_and_b32 s6, s4, 0x8000 +; SI-NEXT: v_cmp_eq_f32_e32 vcc, v8, v7 +; SI-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-NEXT: s_cselect_b32 s6, s6, s4 ; SI-NEXT: s_mov_b64 vcc, exec -; SI-NEXT: s_cbranch_execz .LBB10_27 -; SI-NEXT: s_branch .LBB10_32 -; SI-NEXT: .LBB10_26: -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_cbranch_execz .LBB10_32 +; SI-NEXT: ; %bb.30: +; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: s_branch .LBB10_37 +; SI-NEXT: .LBB10_31: +; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB10_27: ; %frem.compute -; SI-NEXT: s_mov_b32 s3, 0x7f800000 -; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v12|, s3 -; SI-NEXT: v_frexp_exp_i32_f32_e32 v11, v12 +; SI-NEXT: .LBB10_32: ; %frem.compute +; SI-NEXT: s_mov_b32 s7, 0x7f800000 +; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v8|, s7 +; SI-NEXT: v_frexp_exp_i32_f32_e32 v9, v8 ; SI-NEXT: s_and_b64 s[0:1], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s0, v11 -; SI-NEXT: s_cselect_b32 s2, s0, 0 -; SI-NEXT: v_frexp_mant_f32_e32 v11, v12 -; SI-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc -; SI-NEXT: v_ldexp_f32_e64 v12, v11, 11 -; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v13|, s3 -; SI-NEXT: v_frexp_mant_f32_e32 v11, v13 -; SI-NEXT: v_cndmask_b32_e32 v11, v13, v11, vcc -; SI-NEXT: v_frexp_exp_i32_f32_e32 v13, v13 +; SI-NEXT: v_readfirstlane_b32 s0, v9 +; SI-NEXT: s_cselect_b32 s6, s0, 0 +; SI-NEXT: v_frexp_mant_f32_e32 v9, v8 +; SI-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc +; SI-NEXT: v_ldexp_f32_e64 v8, v8, 11 +; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v7|, s7 +; SI-NEXT: v_frexp_mant_f32_e32 v9, v7 +; SI-NEXT: v_cndmask_b32_e32 v9, v7, v9, vcc +; SI-NEXT: v_frexp_exp_i32_f32_e32 v7, v7 ; SI-NEXT: s_and_b64 s[0:1], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s0, v13 -; SI-NEXT: s_cselect_b32 s3, s0, 0 -; SI-NEXT: s_add_i32 s0, s3, -1 -; SI-NEXT: v_ldexp_f32_e64 v11, v11, 1 +; SI-NEXT: v_readfirstlane_b32 s0, v7 +; SI-NEXT: s_cselect_b32 s7, s0, 0 +; SI-NEXT: s_add_i32 s0, s7, -1 +; SI-NEXT: v_ldexp_f32_e64 v7, v9, 1 ; SI-NEXT: s_not_b32 s1, s0 -; SI-NEXT: s_add_i32 s1, s1, s2 -; SI-NEXT: v_div_scale_f32 v13, vcc, 1.0, v11, 1.0 -; SI-NEXT: v_div_scale_f32 v14, s[4:5], v11, v11, 1.0 -; SI-NEXT: v_rcp_f32_e32 v15, v14 +; SI-NEXT: s_add_i32 s1, s1, s6 +; SI-NEXT: v_div_scale_f32 v9, vcc, 1.0, v7, 1.0 +; SI-NEXT: v_div_scale_f32 v10, s[10:11], v7, v7, 1.0 +; SI-NEXT: v_rcp_f32_e32 v11, v10 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v16, -v14, v15, 1.0 -; SI-NEXT: v_fma_f32 v15, v16, v15, v15 -; SI-NEXT: v_mul_f32_e32 v16, v13, v15 -; SI-NEXT: v_fma_f32 v17, -v14, v16, v13 -; SI-NEXT: v_fma_f32 v16, v17, v15, v16 -; SI-NEXT: v_fma_f32 v13, -v14, v16, v13 +; SI-NEXT: v_fma_f32 v12, -v10, v11, 1.0 +; SI-NEXT: v_fma_f32 v11, v12, v11, v11 +; SI-NEXT: v_mul_f32_e32 v12, v9, v11 +; SI-NEXT: v_fma_f32 v13, -v10, v12, v9 +; SI-NEXT: v_fma_f32 v12, v13, v11, v12 +; SI-NEXT: v_fma_f32 v9, -v10, v12, v9 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v13, v13, v15, v16 -; SI-NEXT: v_div_fixup_f32 v13, v13, v11, 1.0 +; SI-NEXT: v_div_fmas_f32 v9, v9, v11, v12 +; SI-NEXT: v_div_fixup_f32 v9, v9, v7, 1.0 ; SI-NEXT: s_cmp_lt_i32 s1, 12 -; SI-NEXT: s_cbranch_scc1 .LBB10_31 -; SI-NEXT: ; %bb.28: ; %frem.loop_body.preheader -; SI-NEXT: s_sub_i32 s1, s2, s3 +; SI-NEXT: s_cbranch_scc1 .LBB10_36 +; SI-NEXT: ; %bb.33: ; %frem.loop_body.preheader +; SI-NEXT: s_sub_i32 s1, s6, s7 ; SI-NEXT: s_add_i32 s1, s1, 11 -; SI-NEXT: .LBB10_29: ; %frem.loop_body +; SI-NEXT: .LBB10_34: ; %frem.loop_body ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_mov_b32_e32 v14, v12 -; SI-NEXT: v_mul_f32_e32 v12, v14, v13 -; SI-NEXT: v_rndne_f32_e32 v12, v12 -; SI-NEXT: v_fma_f32 v12, -v12, v11, v14 -; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v12 -; SI-NEXT: v_add_f32_e32 v15, v12, v11 -; SI-NEXT: v_cndmask_b32_e32 v12, v12, v15, vcc -; SI-NEXT: v_ldexp_f32_e64 v12, v12, 11 +; SI-NEXT: v_mov_b32_e32 v10, v8 +; SI-NEXT: v_mul_f32_e32 v8, v10, v9 +; SI-NEXT: v_rndne_f32_e32 v8, v8 +; SI-NEXT: v_fma_f32 v8, -v8, v7, v10 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v8 +; SI-NEXT: v_add_f32_e32 v11, v8, v7 +; SI-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc +; SI-NEXT: v_ldexp_f32_e64 v8, v8, 11 ; SI-NEXT: s_add_i32 s1, s1, -11 ; SI-NEXT: s_cmp_gt_i32 s1, 11 -; SI-NEXT: s_cbranch_scc1 .LBB10_29 -; SI-NEXT: ; %bb.30: ; %Flow -; SI-NEXT: v_mov_b32_e32 v12, v14 -; SI-NEXT: .LBB10_31: ; %frem.loop_exit +; SI-NEXT: s_cbranch_scc1 .LBB10_34 +; SI-NEXT: ; %bb.35: ; %Flow +; SI-NEXT: v_mov_b32_e32 v8, v10 +; SI-NEXT: .LBB10_36: ; %frem.loop_exit ; SI-NEXT: s_add_i32 s1, s1, -10 -; SI-NEXT: v_ldexp_f32_e64 v12, v12, s1 -; SI-NEXT: v_mul_f32_e32 v13, v12, v13 -; SI-NEXT: v_rndne_f32_e32 v13, v13 -; SI-NEXT: v_fma_f32 v12, -v13, v11, v12 -; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v12 -; SI-NEXT: v_add_f32_e32 v11, v12, v11 -; SI-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc -; SI-NEXT: v_ldexp_f32_e64 v11, v11, s0 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_bfi_b32 v11, s0, v11, v0 -; SI-NEXT: .LBB10_32: ; %Flow124 +; SI-NEXT: v_ldexp_f32_e64 v8, v8, s1 +; SI-NEXT: v_mul_f32_e32 v9, v8, v9 +; SI-NEXT: v_rndne_f32_e32 v9, v9 +; SI-NEXT: v_fma_f32 v8, -v9, v7, v8 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v8 +; SI-NEXT: v_add_f32_e32 v7, v8, v7 +; SI-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; SI-NEXT: v_ldexp_f32_e64 v7, v7, s0 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e64 v6, |v6| -; SI-NEXT: s_mov_b32 s2, 0x7f800000 -; SI-NEXT: v_cmp_nle_f32_e64 s[0:1], s2, v6 +; SI-NEXT: v_and_b32_e32 v7, 0x7fff, v7 +; SI-NEXT: s_and_b32 s0, s4, 0x8000 +; SI-NEXT: v_or_b32_e32 v7, s0, v7 +; SI-NEXT: .LBB10_37: ; %Flow124 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, |s3| +; SI-NEXT: s_mov_b32 s3, 0x7f800000 +; SI-NEXT: v_cmp_nle_f32_e64 s[0:1], s3, v0 ; SI-NEXT: s_and_b64 vcc, s[0:1], vcc -; SI-NEXT: v_mov_b32_e32 v6, 0x7fc00000 -; SI-NEXT: v_cndmask_b32_e32 v7, v6, v7, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e64 v4, |v4| -; SI-NEXT: v_cmp_nle_f32_e64 s[0:1], s2, v4 +; SI-NEXT: v_mov_b32_e32 v0, 0x7e00 +; SI-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v4 +; SI-NEXT: v_cvt_f32_f16_e64 v4, |s5| +; SI-NEXT: v_cmp_nle_f32_e64 s[0:1], s3, v4 ; SI-NEXT: s_and_b64 vcc, s[0:1], vcc -; SI-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e64 v2, |v2| -; SI-NEXT: v_cmp_nle_f32_e64 s[0:1], s2, v2 +; SI-NEXT: v_cndmask_b32_e32 v4, v0, v5, vcc +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v1 +; SI-NEXT: v_cvt_f32_f16_e64 v1, |s2| +; SI-NEXT: v_cmp_nle_f32_e64 s[0:1], s3, v1 ; SI-NEXT: s_and_b64 vcc, s[0:1], vcc -; SI-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v6, vcc ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; SI-NEXT: v_cmp_nle_f32_e64 s[0:1], s2, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v2 +; SI-NEXT: v_cvt_f32_f16_e64 v2, |s4| +; SI-NEXT: v_cmp_nle_f32_e64 s[0:1], s3, v2 ; SI-NEXT: s_and_b64 vcc, s[0:1], vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v6, v1, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_or_b32_e32 v0, v7, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: frem_v4f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: s_mov_b32 s2, s6 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_mov_b32 s6, s2 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s4, s10 -; CI-NEXT: s_mov_b32 s5, s11 -; CI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 -; CI-NEXT: s_mov_b32 s3, s7 -; CI-NEXT: buffer_load_dwordx2 v[7:8], off, s[0:3], 0 offset:32 -; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f32_f16_e32 v6, v3 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; CI-NEXT: s_mov_b32 s0, s10 +; CI-NEXT: s_mov_b32 s1, s11 +; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: s_mov_b32 s7, s3 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v1 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v8 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v6 -; CI-NEXT: v_cmp_ngt_f32_e64 s[0:1], |v6|, |v7| -; CI-NEXT: v_and_b32_e32 v10, 0x7fffffff, v6 -; CI-NEXT: v_and_b32_e32 v9, 0x7fffffff, v7 -; CI-NEXT: s_and_b64 vcc, exec, s[0:1] -; CI-NEXT: s_cbranch_vccz .LBB10_2 +; CI-NEXT: v_readfirstlane_b32 s2, v1 +; CI-NEXT: v_readfirstlane_b32 s3, v0 +; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 +; CI-NEXT: v_cvt_f32_f16_e64 v3, |s3| +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e64 v2, |v0| +; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2 +; CI-NEXT: s_cbranch_vccz .LBB10_3 ; CI-NEXT: ; %bb.1: ; %frem.else86 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_and_b32_e32 v11, 0x80000000, v6 -; CI-NEXT: v_cmp_eq_f32_e32 vcc, v10, v9 -; CI-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc -; CI-NEXT: s_cbranch_execz .LBB10_3 -; CI-NEXT: s_branch .LBB10_8 -; CI-NEXT: .LBB10_2: -; CI-NEXT: ; implicit-def: $vgpr8 -; CI-NEXT: .LBB10_3: ; %frem.compute85 -; CI-NEXT: v_frexp_exp_i32_f32_e32 v13, v10 -; CI-NEXT: v_frexp_mant_f32_e32 v8, v10 -; CI-NEXT: v_frexp_mant_f32_e32 v10, v9 -; CI-NEXT: v_frexp_exp_i32_f32_e32 v14, v9 -; CI-NEXT: v_ldexp_f32_e64 v9, v10, 1 -; CI-NEXT: v_div_scale_f32 v15, s[0:1], v9, v9, 1.0 -; CI-NEXT: v_ldexp_f32_e64 v11, v8, 11 -; CI-NEXT: v_add_i32_e32 v8, vcc, -1, v14 -; CI-NEXT: v_not_b32_e32 v10, v8 -; CI-NEXT: v_add_i32_e32 v10, vcc, v10, v13 -; CI-NEXT: v_div_scale_f32 v12, vcc, 1.0, v9, 1.0 -; CI-NEXT: v_rcp_f32_e32 v16, v15 +; CI-NEXT: s_and_b32 s4, s3, 0xffff8000 +; CI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2 +; CI-NEXT: s_and_b64 s[0:1], vcc, exec +; CI-NEXT: s_cselect_b32 s4, s4, s3 +; CI-NEXT: s_cbranch_execz .LBB10_4 +; CI-NEXT: ; %bb.2: +; CI-NEXT: v_mov_b32_e32 v3, s4 +; CI-NEXT: s_branch .LBB10_9 +; CI-NEXT: .LBB10_3: +; CI-NEXT: ; implicit-def: $sgpr4 +; CI-NEXT: .LBB10_4: ; %frem.compute85 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v7, v3 +; CI-NEXT: v_frexp_mant_f32_e32 v3, v3 +; CI-NEXT: v_ldexp_f32_e64 v5, v3, 11 +; CI-NEXT: v_frexp_mant_f32_e32 v3, v2 +; CI-NEXT: v_ldexp_f32_e64 v3, v3, 1 +; CI-NEXT: v_div_scale_f32 v9, s[0:1], v3, v3, 1.0 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v8, v2 +; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v8 +; CI-NEXT: v_not_b32_e32 v4, v2 +; CI-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CI-NEXT: v_div_scale_f32 v6, vcc, 1.0, v3, 1.0 +; CI-NEXT: v_rcp_f32_e32 v10, v9 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v17, -v15, v16, 1.0 -; CI-NEXT: v_fma_f32 v16, v17, v16, v16 -; CI-NEXT: v_mul_f32_e32 v17, v12, v16 -; CI-NEXT: v_fma_f32 v18, -v15, v17, v12 -; CI-NEXT: v_fma_f32 v17, v18, v16, v17 -; CI-NEXT: v_fma_f32 v12, -v15, v17, v12 +; CI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 +; CI-NEXT: v_fma_f32 v10, v11, v10, v10 +; CI-NEXT: v_mul_f32_e32 v11, v6, v10 +; CI-NEXT: v_fma_f32 v12, -v9, v11, v6 +; CI-NEXT: v_fma_f32 v11, v12, v10, v11 +; CI-NEXT: v_fma_f32 v6, -v9, v11, v6 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v12, v12, v16, v17 -; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v10 -; CI-NEXT: v_div_fixup_f32 v12, v12, v9, 1.0 -; CI-NEXT: s_cbranch_vccnz .LBB10_7 -; CI-NEXT: ; %bb.4: ; %frem.loop_body93.preheader -; CI-NEXT: v_sub_i32_e32 v10, vcc, v13, v14 -; CI-NEXT: v_add_i32_e32 v10, vcc, 11, v10 -; CI-NEXT: .LBB10_5: ; %frem.loop_body93 +; CI-NEXT: v_div_fmas_f32 v6, v6, v10, v11 +; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v4 +; CI-NEXT: v_div_fixup_f32 v6, v6, v3, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB10_8 +; CI-NEXT: ; %bb.5: ; %frem.loop_body93.preheader +; CI-NEXT: v_sub_i32_e32 v4, vcc, v7, v8 +; CI-NEXT: v_add_i32_e32 v4, vcc, 11, v4 +; CI-NEXT: .LBB10_6: ; %frem.loop_body93 ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 -; CI-NEXT: v_mov_b32_e32 v13, v11 -; CI-NEXT: v_mul_f32_e32 v11, v13, v12 -; CI-NEXT: v_rndne_f32_e32 v11, v11 -; CI-NEXT: v_fma_f32 v11, -v11, v9, v13 -; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v11 -; CI-NEXT: v_add_f32_e32 v14, v11, v9 -; CI-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc -; CI-NEXT: v_add_i32_e32 v10, vcc, -11, v10 -; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v10 -; CI-NEXT: v_ldexp_f32_e64 v11, v11, 11 -; CI-NEXT: s_cbranch_vccnz .LBB10_5 -; CI-NEXT: ; %bb.6: ; %Flow133 -; CI-NEXT: v_mov_b32_e32 v11, v13 -; CI-NEXT: .LBB10_7: ; %frem.loop_exit94 -; CI-NEXT: v_add_i32_e32 v10, vcc, -10, v10 -; CI-NEXT: v_ldexp_f32_e32 v10, v11, v10 -; CI-NEXT: v_mul_f32_e32 v11, v10, v12 -; CI-NEXT: v_rndne_f32_e32 v11, v11 -; CI-NEXT: v_fma_f32 v10, -v11, v9, v10 -; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v10 -; CI-NEXT: v_add_f32_e32 v9, v10, v9 -; CI-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc -; CI-NEXT: v_ldexp_f32_e32 v8, v9, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; CI-NEXT: s_brev_b32 s0, -2 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_bfi_b32 v8, s0, v8, v6 -; CI-NEXT: .LBB10_8: -; CI-NEXT: v_cvt_f16_f32_e32 v9, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v5 -; CI-NEXT: v_cvt_f32_f16_e64 v11, |v9| -; CI-NEXT: v_cvt_f32_f16_e64 v10, |v10| -; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v11, v10 -; CI-NEXT: s_cbranch_vccz .LBB10_10 -; CI-NEXT: ; %bb.9: ; %frem.else53 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; CI-NEXT: v_and_b32_e32 v12, 0x80000000, v4 -; CI-NEXT: v_cmp_eq_f32_e32 vcc, v11, v10 -; CI-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc -; CI-NEXT: s_cbranch_execz .LBB10_11 -; CI-NEXT: s_branch .LBB10_16 -; CI-NEXT: .LBB10_10: -; CI-NEXT: ; implicit-def: $vgpr9 -; CI-NEXT: .LBB10_11: ; %frem.compute52 -; CI-NEXT: v_frexp_exp_i32_f32_e32 v14, v11 -; CI-NEXT: v_frexp_mant_f32_e32 v9, v11 -; CI-NEXT: v_frexp_mant_f32_e32 v11, v10 -; CI-NEXT: v_frexp_exp_i32_f32_e32 v15, v10 -; CI-NEXT: v_ldexp_f32_e64 v10, v11, 1 -; CI-NEXT: v_div_scale_f32 v16, s[0:1], v10, v10, 1.0 -; CI-NEXT: v_ldexp_f32_e64 v12, v9, 11 -; CI-NEXT: v_add_i32_e32 v9, vcc, -1, v15 -; CI-NEXT: v_not_b32_e32 v11, v9 -; CI-NEXT: v_add_i32_e32 v11, vcc, v11, v14 -; CI-NEXT: v_div_scale_f32 v13, vcc, 1.0, v10, 1.0 -; CI-NEXT: v_rcp_f32_e32 v17, v16 +; CI-NEXT: v_mov_b32_e32 v7, v5 +; CI-NEXT: v_mul_f32_e32 v5, v7, v6 +; CI-NEXT: v_rndne_f32_e32 v5, v5 +; CI-NEXT: v_fma_f32 v5, -v5, v3, v7 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; CI-NEXT: v_add_f32_e32 v8, v5, v3 +; CI-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc +; CI-NEXT: v_add_i32_e32 v4, vcc, -11, v4 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v4 +; CI-NEXT: v_ldexp_f32_e64 v5, v5, 11 +; CI-NEXT: s_cbranch_vccnz .LBB10_6 +; CI-NEXT: ; %bb.7: ; %Flow133 +; CI-NEXT: v_mov_b32_e32 v5, v7 +; CI-NEXT: .LBB10_8: ; %frem.loop_exit94 +; CI-NEXT: v_add_i32_e32 v4, vcc, -10, v4 +; CI-NEXT: v_ldexp_f32_e32 v4, v5, v4 +; CI-NEXT: v_mul_f32_e32 v5, v4, v6 +; CI-NEXT: v_rndne_f32_e32 v5, v5 +; CI-NEXT: v_fma_f32 v4, -v5, v3, v4 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 +; CI-NEXT: v_add_f32_e32 v3, v4, v3 +; CI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; CI-NEXT: v_ldexp_f32_e32 v2, v3, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: s_and_b32 s0, s3, 0xffff8000 +; CI-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; CI-NEXT: v_or_b32_e32 v3, s0, v2 +; CI-NEXT: .LBB10_9: ; %Flow136 +; CI-NEXT: s_lshr_b32 s4, s2, 16 +; CI-NEXT: s_lshr_b32 s5, s3, 16 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; CI-NEXT: ; %bb.10: +; CI-NEXT: v_cvt_f32_f16_e64 v6, |s5| +; CI-NEXT: v_cvt_f32_f16_e64 v5, |v4| +; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v6, v5 +; CI-NEXT: s_cbranch_vccz .LBB10_13 +; CI-NEXT: ; %bb.11: ; %frem.else53 +; CI-NEXT: s_and_b32 s6, s5, 0x8000 +; CI-NEXT: v_cmp_eq_f32_e32 vcc, v6, v5 +; CI-NEXT: s_and_b64 s[0:1], vcc, exec +; CI-NEXT: s_cselect_b32 s6, s6, s5 +; CI-NEXT: s_cbranch_execz .LBB10_14 +; CI-NEXT: ; %bb.12: +; CI-NEXT: v_mov_b32_e32 v5, s6 +; CI-NEXT: s_branch .LBB10_19 +; CI-NEXT: .LBB10_13: +; CI-NEXT: ; implicit-def: $sgpr6 +; CI-NEXT: .LBB10_14: ; %frem.compute52 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v10, v6 +; CI-NEXT: v_frexp_mant_f32_e32 v6, v6 +; CI-NEXT: v_ldexp_f32_e64 v8, v6, 11 +; CI-NEXT: v_frexp_mant_f32_e32 v6, v5 +; CI-NEXT: v_ldexp_f32_e64 v6, v6, 1 +; CI-NEXT: v_div_scale_f32 v12, s[0:1], v6, v6, 1.0 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v11, v5 +; CI-NEXT: v_add_i32_e32 v5, vcc, -1, v11 +; CI-NEXT: v_not_b32_e32 v7, v5 +; CI-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; CI-NEXT: v_div_scale_f32 v9, vcc, 1.0, v6, 1.0 +; CI-NEXT: v_rcp_f32_e32 v13, v12 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v18, -v16, v17, 1.0 -; CI-NEXT: v_fma_f32 v17, v18, v17, v17 -; CI-NEXT: v_mul_f32_e32 v18, v13, v17 -; CI-NEXT: v_fma_f32 v19, -v16, v18, v13 -; CI-NEXT: v_fma_f32 v18, v19, v17, v18 -; CI-NEXT: v_fma_f32 v13, -v16, v18, v13 +; CI-NEXT: v_fma_f32 v14, -v12, v13, 1.0 +; CI-NEXT: v_fma_f32 v13, v14, v13, v13 +; CI-NEXT: v_mul_f32_e32 v14, v9, v13 +; CI-NEXT: v_fma_f32 v15, -v12, v14, v9 +; CI-NEXT: v_fma_f32 v14, v15, v13, v14 +; CI-NEXT: v_fma_f32 v9, -v12, v14, v9 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v13, v13, v17, v18 -; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v11 -; CI-NEXT: v_div_fixup_f32 v13, v13, v10, 1.0 -; CI-NEXT: s_cbranch_vccnz .LBB10_15 -; CI-NEXT: ; %bb.12: ; %frem.loop_body60.preheader -; CI-NEXT: v_sub_i32_e32 v11, vcc, v14, v15 -; CI-NEXT: v_add_i32_e32 v11, vcc, 11, v11 -; CI-NEXT: .LBB10_13: ; %frem.loop_body60 +; CI-NEXT: v_div_fmas_f32 v9, v9, v13, v14 +; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v7 +; CI-NEXT: v_div_fixup_f32 v9, v9, v6, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB10_18 +; CI-NEXT: ; %bb.15: ; %frem.loop_body60.preheader +; CI-NEXT: v_sub_i32_e32 v7, vcc, v10, v11 +; CI-NEXT: v_add_i32_e32 v7, vcc, 11, v7 +; CI-NEXT: .LBB10_16: ; %frem.loop_body60 ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 -; CI-NEXT: v_mov_b32_e32 v14, v12 -; CI-NEXT: v_mul_f32_e32 v12, v14, v13 -; CI-NEXT: v_rndne_f32_e32 v12, v12 -; CI-NEXT: v_fma_f32 v12, -v12, v10, v14 -; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v12 -; CI-NEXT: v_add_f32_e32 v15, v12, v10 -; CI-NEXT: v_cndmask_b32_e32 v12, v12, v15, vcc -; CI-NEXT: v_add_i32_e32 v11, vcc, -11, v11 -; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v11 -; CI-NEXT: v_ldexp_f32_e64 v12, v12, 11 -; CI-NEXT: s_cbranch_vccnz .LBB10_13 -; CI-NEXT: ; %bb.14: ; %Flow129 -; CI-NEXT: v_mov_b32_e32 v12, v14 -; CI-NEXT: .LBB10_15: ; %frem.loop_exit61 -; CI-NEXT: v_add_i32_e32 v11, vcc, -10, v11 -; CI-NEXT: v_ldexp_f32_e32 v11, v12, v11 -; CI-NEXT: v_mul_f32_e32 v12, v11, v13 -; CI-NEXT: v_rndne_f32_e32 v12, v12 -; CI-NEXT: v_fma_f32 v11, -v12, v10, v11 -; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v11 -; CI-NEXT: v_add_f32_e32 v10, v11, v10 -; CI-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc -; CI-NEXT: v_ldexp_f32_e32 v9, v10, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; CI-NEXT: s_brev_b32 s0, -2 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; CI-NEXT: v_bfi_b32 v9, s0, v9, v4 -; CI-NEXT: .LBB10_16: -; CI-NEXT: v_cvt_f16_f32_e32 v10, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v3 -; CI-NEXT: v_cvt_f32_f16_e64 v12, |v10| -; CI-NEXT: v_cvt_f32_f16_e64 v11, |v11| -; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v12, v11 -; CI-NEXT: s_cbranch_vccz .LBB10_18 -; CI-NEXT: ; %bb.17: ; %frem.else20 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; CI-NEXT: v_and_b32_e32 v13, 0x80000000, v2 -; CI-NEXT: v_cmp_eq_f32_e32 vcc, v12, v11 -; CI-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc -; CI-NEXT: s_cbranch_execz .LBB10_19 -; CI-NEXT: s_branch .LBB10_24 -; CI-NEXT: .LBB10_18: -; CI-NEXT: ; implicit-def: $vgpr10 -; CI-NEXT: .LBB10_19: ; %frem.compute19 -; CI-NEXT: v_frexp_exp_i32_f32_e32 v15, v12 -; CI-NEXT: v_frexp_mant_f32_e32 v10, v12 -; CI-NEXT: v_frexp_mant_f32_e32 v12, v11 -; CI-NEXT: v_frexp_exp_i32_f32_e32 v16, v11 -; CI-NEXT: v_ldexp_f32_e64 v11, v12, 1 -; CI-NEXT: v_div_scale_f32 v17, s[0:1], v11, v11, 1.0 -; CI-NEXT: v_ldexp_f32_e64 v13, v10, 11 -; CI-NEXT: v_add_i32_e32 v10, vcc, -1, v16 -; CI-NEXT: v_not_b32_e32 v12, v10 -; CI-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; CI-NEXT: v_div_scale_f32 v14, vcc, 1.0, v11, 1.0 -; CI-NEXT: v_rcp_f32_e32 v18, v17 +; CI-NEXT: v_mov_b32_e32 v10, v8 +; CI-NEXT: v_mul_f32_e32 v8, v10, v9 +; CI-NEXT: v_rndne_f32_e32 v8, v8 +; CI-NEXT: v_fma_f32 v8, -v8, v6, v10 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v8 +; CI-NEXT: v_add_f32_e32 v11, v8, v6 +; CI-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc +; CI-NEXT: v_add_i32_e32 v7, vcc, -11, v7 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v7 +; CI-NEXT: v_ldexp_f32_e64 v8, v8, 11 +; CI-NEXT: s_cbranch_vccnz .LBB10_16 +; CI-NEXT: ; %bb.17: ; %Flow129 +; CI-NEXT: v_mov_b32_e32 v8, v10 +; CI-NEXT: .LBB10_18: ; %frem.loop_exit61 +; CI-NEXT: v_add_i32_e32 v7, vcc, -10, v7 +; CI-NEXT: v_ldexp_f32_e32 v7, v8, v7 +; CI-NEXT: v_mul_f32_e32 v8, v7, v9 +; CI-NEXT: v_rndne_f32_e32 v8, v8 +; CI-NEXT: v_fma_f32 v7, -v8, v6, v7 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v7 +; CI-NEXT: v_add_f32_e32 v6, v7, v6 +; CI-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc +; CI-NEXT: v_ldexp_f32_e32 v5, v6, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: s_and_b32 s0, s5, 0x8000 +; CI-NEXT: v_and_b32_e32 v5, 0x7fff, v5 +; CI-NEXT: v_or_b32_e32 v5, s0, v5 +; CI-NEXT: .LBB10_19: +; CI-NEXT: v_cvt_f32_f16_e64 v7, |s2| +; CI-NEXT: v_cvt_f32_f16_e64 v6, |v1| +; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v7, v6 +; CI-NEXT: s_cbranch_vccz .LBB10_22 +; CI-NEXT: ; %bb.20: ; %frem.else20 +; CI-NEXT: s_and_b32 s6, s2, 0xffff8000 +; CI-NEXT: v_cmp_eq_f32_e32 vcc, v7, v6 +; CI-NEXT: s_and_b64 s[0:1], vcc, exec +; CI-NEXT: s_cselect_b32 s6, s6, s2 +; CI-NEXT: s_cbranch_execz .LBB10_23 +; CI-NEXT: ; %bb.21: +; CI-NEXT: v_mov_b32_e32 v6, s6 +; CI-NEXT: s_branch .LBB10_28 +; CI-NEXT: .LBB10_22: +; CI-NEXT: ; implicit-def: $sgpr6 +; CI-NEXT: .LBB10_23: ; %frem.compute19 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v11, v7 +; CI-NEXT: v_frexp_mant_f32_e32 v7, v7 +; CI-NEXT: v_ldexp_f32_e64 v9, v7, 11 +; CI-NEXT: v_frexp_mant_f32_e32 v7, v6 +; CI-NEXT: v_ldexp_f32_e64 v7, v7, 1 +; CI-NEXT: v_div_scale_f32 v13, s[0:1], v7, v7, 1.0 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v12, v6 +; CI-NEXT: v_add_i32_e32 v6, vcc, -1, v12 +; CI-NEXT: v_not_b32_e32 v8, v6 +; CI-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; CI-NEXT: v_div_scale_f32 v10, vcc, 1.0, v7, 1.0 +; CI-NEXT: v_rcp_f32_e32 v14, v13 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v19, -v17, v18, 1.0 -; CI-NEXT: v_fma_f32 v18, v19, v18, v18 -; CI-NEXT: v_mul_f32_e32 v19, v14, v18 -; CI-NEXT: v_fma_f32 v20, -v17, v19, v14 -; CI-NEXT: v_fma_f32 v19, v20, v18, v19 -; CI-NEXT: v_fma_f32 v14, -v17, v19, v14 +; CI-NEXT: v_fma_f32 v15, -v13, v14, 1.0 +; CI-NEXT: v_fma_f32 v14, v15, v14, v14 +; CI-NEXT: v_mul_f32_e32 v15, v10, v14 +; CI-NEXT: v_fma_f32 v16, -v13, v15, v10 +; CI-NEXT: v_fma_f32 v15, v16, v14, v15 +; CI-NEXT: v_fma_f32 v10, -v13, v15, v10 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v14, v14, v18, v19 -; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v12 -; CI-NEXT: v_div_fixup_f32 v14, v14, v11, 1.0 -; CI-NEXT: s_cbranch_vccnz .LBB10_23 -; CI-NEXT: ; %bb.20: ; %frem.loop_body27.preheader -; CI-NEXT: v_sub_i32_e32 v12, vcc, v15, v16 -; CI-NEXT: v_add_i32_e32 v12, vcc, 11, v12 -; CI-NEXT: .LBB10_21: ; %frem.loop_body27 +; CI-NEXT: v_div_fmas_f32 v10, v10, v14, v15 +; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v8 +; CI-NEXT: v_div_fixup_f32 v10, v10, v7, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB10_27 +; CI-NEXT: ; %bb.24: ; %frem.loop_body27.preheader +; CI-NEXT: v_sub_i32_e32 v8, vcc, v11, v12 +; CI-NEXT: v_add_i32_e32 v8, vcc, 11, v8 +; CI-NEXT: .LBB10_25: ; %frem.loop_body27 ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 -; CI-NEXT: v_mov_b32_e32 v15, v13 -; CI-NEXT: v_mul_f32_e32 v13, v15, v14 -; CI-NEXT: v_rndne_f32_e32 v13, v13 -; CI-NEXT: v_fma_f32 v13, -v13, v11, v15 -; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v13 -; CI-NEXT: v_add_f32_e32 v16, v13, v11 -; CI-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc -; CI-NEXT: v_add_i32_e32 v12, vcc, -11, v12 -; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v12 -; CI-NEXT: v_ldexp_f32_e64 v13, v13, 11 -; CI-NEXT: s_cbranch_vccnz .LBB10_21 -; CI-NEXT: ; %bb.22: ; %Flow125 -; CI-NEXT: v_mov_b32_e32 v13, v15 -; CI-NEXT: .LBB10_23: ; %frem.loop_exit28 -; CI-NEXT: v_add_i32_e32 v12, vcc, -10, v12 -; CI-NEXT: v_ldexp_f32_e32 v12, v13, v12 -; CI-NEXT: v_mul_f32_e32 v13, v12, v14 -; CI-NEXT: v_rndne_f32_e32 v13, v13 -; CI-NEXT: v_fma_f32 v12, -v13, v11, v12 -; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v12 -; CI-NEXT: v_add_f32_e32 v11, v12, v11 -; CI-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc -; CI-NEXT: v_ldexp_f32_e32 v10, v11, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; CI-NEXT: s_brev_b32 s0, -2 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; CI-NEXT: v_bfi_b32 v10, s0, v10, v2 -; CI-NEXT: .LBB10_24: -; CI-NEXT: v_cvt_f16_f32_e32 v11, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v12, v1 -; CI-NEXT: v_cvt_f32_f16_e64 v13, |v11| -; CI-NEXT: v_cvt_f32_f16_e64 v12, |v12| -; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v13, v12 -; CI-NEXT: s_cbranch_vccz .LBB10_26 -; CI-NEXT: ; %bb.25: ; %frem.else -; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: v_and_b32_e32 v14, 0x80000000, v0 -; CI-NEXT: v_cmp_eq_f32_e32 vcc, v13, v12 -; CI-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc -; CI-NEXT: s_cbranch_execz .LBB10_27 -; CI-NEXT: s_branch .LBB10_32 -; CI-NEXT: .LBB10_26: -; CI-NEXT: ; implicit-def: $vgpr11 -; CI-NEXT: .LBB10_27: ; %frem.compute -; CI-NEXT: v_frexp_exp_i32_f32_e32 v16, v13 -; CI-NEXT: v_frexp_mant_f32_e32 v11, v13 -; CI-NEXT: v_frexp_mant_f32_e32 v13, v12 -; CI-NEXT: v_frexp_exp_i32_f32_e32 v17, v12 -; CI-NEXT: v_ldexp_f32_e64 v12, v13, 1 -; CI-NEXT: v_div_scale_f32 v18, s[0:1], v12, v12, 1.0 -; CI-NEXT: v_ldexp_f32_e64 v14, v11, 11 -; CI-NEXT: v_add_i32_e32 v11, vcc, -1, v17 -; CI-NEXT: v_not_b32_e32 v13, v11 -; CI-NEXT: v_add_i32_e32 v13, vcc, v13, v16 -; CI-NEXT: v_div_scale_f32 v15, vcc, 1.0, v12, 1.0 -; CI-NEXT: v_rcp_f32_e32 v19, v18 +; CI-NEXT: v_mov_b32_e32 v11, v9 +; CI-NEXT: v_mul_f32_e32 v9, v11, v10 +; CI-NEXT: v_rndne_f32_e32 v9, v9 +; CI-NEXT: v_fma_f32 v9, -v9, v7, v11 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v9 +; CI-NEXT: v_add_f32_e32 v12, v9, v7 +; CI-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc +; CI-NEXT: v_add_i32_e32 v8, vcc, -11, v8 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v8 +; CI-NEXT: v_ldexp_f32_e64 v9, v9, 11 +; CI-NEXT: s_cbranch_vccnz .LBB10_25 +; CI-NEXT: ; %bb.26: ; %Flow125 +; CI-NEXT: v_mov_b32_e32 v9, v11 +; CI-NEXT: .LBB10_27: ; %frem.loop_exit28 +; CI-NEXT: v_add_i32_e32 v8, vcc, -10, v8 +; CI-NEXT: v_ldexp_f32_e32 v8, v9, v8 +; CI-NEXT: v_mul_f32_e32 v9, v8, v10 +; CI-NEXT: v_rndne_f32_e32 v9, v9 +; CI-NEXT: v_fma_f32 v8, -v9, v7, v8 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v8 +; CI-NEXT: v_add_f32_e32 v7, v8, v7 +; CI-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; CI-NEXT: v_ldexp_f32_e32 v6, v7, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: s_and_b32 s0, s2, 0xffff8000 +; CI-NEXT: v_and_b32_e32 v6, 0x7fff, v6 +; CI-NEXT: v_or_b32_e32 v6, s0, v6 +; CI-NEXT: .LBB10_28: +; CI-NEXT: v_cvt_f32_f16_e64 v8, |s4| +; CI-NEXT: v_cvt_f32_f16_e64 v7, |v2| +; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v8, v7 +; CI-NEXT: s_cbranch_vccz .LBB10_31 +; CI-NEXT: ; %bb.29: ; %frem.else +; CI-NEXT: s_and_b32 s6, s4, 0x8000 +; CI-NEXT: v_cmp_eq_f32_e32 vcc, v8, v7 +; CI-NEXT: s_and_b64 s[0:1], vcc, exec +; CI-NEXT: s_cselect_b32 s6, s6, s4 +; CI-NEXT: s_cbranch_execz .LBB10_32 +; CI-NEXT: ; %bb.30: +; CI-NEXT: v_mov_b32_e32 v7, s6 +; CI-NEXT: s_branch .LBB10_37 +; CI-NEXT: .LBB10_31: +; CI-NEXT: ; implicit-def: $sgpr6 +; CI-NEXT: .LBB10_32: ; %frem.compute +; CI-NEXT: v_frexp_exp_i32_f32_e32 v12, v8 +; CI-NEXT: v_frexp_mant_f32_e32 v8, v8 +; CI-NEXT: v_ldexp_f32_e64 v10, v8, 11 +; CI-NEXT: v_frexp_mant_f32_e32 v8, v7 +; CI-NEXT: v_ldexp_f32_e64 v8, v8, 1 +; CI-NEXT: v_div_scale_f32 v14, s[0:1], v8, v8, 1.0 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v13, v7 +; CI-NEXT: v_add_i32_e32 v7, vcc, -1, v13 +; CI-NEXT: v_not_b32_e32 v9, v7 +; CI-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; CI-NEXT: v_div_scale_f32 v11, vcc, 1.0, v8, 1.0 +; CI-NEXT: v_rcp_f32_e32 v15, v14 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v20, -v18, v19, 1.0 -; CI-NEXT: v_fma_f32 v19, v20, v19, v19 -; CI-NEXT: v_mul_f32_e32 v20, v15, v19 -; CI-NEXT: v_fma_f32 v21, -v18, v20, v15 -; CI-NEXT: v_fma_f32 v20, v21, v19, v20 -; CI-NEXT: v_fma_f32 v15, -v18, v20, v15 +; CI-NEXT: v_fma_f32 v16, -v14, v15, 1.0 +; CI-NEXT: v_fma_f32 v15, v16, v15, v15 +; CI-NEXT: v_mul_f32_e32 v16, v11, v15 +; CI-NEXT: v_fma_f32 v17, -v14, v16, v11 +; CI-NEXT: v_fma_f32 v16, v17, v15, v16 +; CI-NEXT: v_fma_f32 v11, -v14, v16, v11 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v15, v15, v19, v20 -; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v13 -; CI-NEXT: v_div_fixup_f32 v15, v15, v12, 1.0 -; CI-NEXT: s_cbranch_vccnz .LBB10_31 -; CI-NEXT: ; %bb.28: ; %frem.loop_body.preheader -; CI-NEXT: v_sub_i32_e32 v13, vcc, v16, v17 -; CI-NEXT: v_add_i32_e32 v13, vcc, 11, v13 -; CI-NEXT: .LBB10_29: ; %frem.loop_body +; CI-NEXT: v_div_fmas_f32 v11, v11, v15, v16 +; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v9 +; CI-NEXT: v_div_fixup_f32 v11, v11, v8, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB10_36 +; CI-NEXT: ; %bb.33: ; %frem.loop_body.preheader +; CI-NEXT: v_sub_i32_e32 v9, vcc, v12, v13 +; CI-NEXT: v_add_i32_e32 v9, vcc, 11, v9 +; CI-NEXT: .LBB10_34: ; %frem.loop_body ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 -; CI-NEXT: v_mov_b32_e32 v16, v14 -; CI-NEXT: v_mul_f32_e32 v14, v16, v15 -; CI-NEXT: v_rndne_f32_e32 v14, v14 -; CI-NEXT: v_fma_f32 v14, -v14, v12, v16 -; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v14 -; CI-NEXT: v_add_f32_e32 v17, v14, v12 -; CI-NEXT: v_cndmask_b32_e32 v14, v14, v17, vcc -; CI-NEXT: v_add_i32_e32 v13, vcc, -11, v13 -; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v13 -; CI-NEXT: v_ldexp_f32_e64 v14, v14, 11 -; CI-NEXT: s_cbranch_vccnz .LBB10_29 -; CI-NEXT: ; %bb.30: ; %Flow -; CI-NEXT: v_mov_b32_e32 v14, v16 -; CI-NEXT: .LBB10_31: ; %frem.loop_exit -; CI-NEXT: v_add_i32_e32 v13, vcc, -10, v13 -; CI-NEXT: v_ldexp_f32_e32 v13, v14, v13 -; CI-NEXT: v_mul_f32_e32 v14, v13, v15 -; CI-NEXT: v_rndne_f32_e32 v14, v14 -; CI-NEXT: v_fma_f32 v13, -v14, v12, v13 -; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v13 -; CI-NEXT: v_add_f32_e32 v12, v13, v12 -; CI-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc -; CI-NEXT: v_ldexp_f32_e32 v11, v12, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: s_brev_b32 s0, -2 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: v_bfi_b32 v11, s0, v11, v0 -; CI-NEXT: .LBB10_32: ; %Flow124 +; CI-NEXT: v_mov_b32_e32 v12, v10 +; CI-NEXT: v_mul_f32_e32 v10, v12, v11 +; CI-NEXT: v_rndne_f32_e32 v10, v10 +; CI-NEXT: v_fma_f32 v10, -v10, v8, v12 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v10 +; CI-NEXT: v_add_f32_e32 v13, v10, v8 +; CI-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc +; CI-NEXT: v_add_i32_e32 v9, vcc, -11, v9 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v9 +; CI-NEXT: v_ldexp_f32_e64 v10, v10, 11 +; CI-NEXT: s_cbranch_vccnz .LBB10_34 +; CI-NEXT: ; %bb.35: ; %Flow +; CI-NEXT: v_mov_b32_e32 v10, v12 +; CI-NEXT: .LBB10_36: ; %frem.loop_exit +; CI-NEXT: v_add_i32_e32 v9, vcc, -10, v9 +; CI-NEXT: v_ldexp_f32_e32 v9, v10, v9 +; CI-NEXT: v_mul_f32_e32 v10, v9, v11 +; CI-NEXT: v_rndne_f32_e32 v10, v10 +; CI-NEXT: v_fma_f32 v9, -v10, v8, v9 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v9 +; CI-NEXT: v_add_f32_e32 v8, v9, v8 +; CI-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc +; CI-NEXT: v_ldexp_f32_e32 v7, v8, v7 ; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: s_mov_b32 s2, 0x7f800000 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e64 v6, |v6| -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v8 -; CI-NEXT: v_cmp_nle_f32_e64 s[0:1], s2, v6 -; CI-NEXT: s_and_b64 vcc, s[0:1], vcc -; CI-NEXT: v_mov_b32_e32 v6, 0x7fc00000 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e64 v4, |v4| -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cndmask_b32_e32 v7, v6, v7, vcc -; CI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v9 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cmp_nle_f32_e64 s[0:1], s2, v4 -; CI-NEXT: s_and_b64 vcc, s[0:1], vcc -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc -; CI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v10 +; CI-NEXT: s_and_b32 s0, s4, 0x8000 +; CI-NEXT: v_and_b32_e32 v7, 0x7fff, v7 +; CI-NEXT: v_or_b32_e32 v7, s0, v7 +; CI-NEXT: .LBB10_37: ; %Flow124 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cmp_nle_f32_e64 s[0:1], s2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v0 +; CI-NEXT: v_cvt_f32_f16_e64 v0, |s3| +; CI-NEXT: s_mov_b32 s3, 0x7f800000 +; CI-NEXT: s_mov_b32 s11, 0xf000 +; CI-NEXT: s_mov_b32 s10, -1 +; CI-NEXT: v_cmp_nle_f32_e64 s[0:1], s3, v0 ; CI-NEXT: s_and_b64 vcc, s[0:1], vcc -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc +; CI-NEXT: v_mov_b32_e32 v0, 0x7e00 +; CI-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc +; CI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v4 +; CI-NEXT: v_cvt_f32_f16_e64 v4, |s5| +; CI-NEXT: v_cmp_nle_f32_e64 s[0:1], s3, v4 +; CI-NEXT: s_and_b64 vcc, s[0:1], vcc +; CI-NEXT: v_cndmask_b32_e32 v4, v0, v5, vcc ; CI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v11 -; CI-NEXT: v_cmp_nle_f32_e64 s[0:1], s2, v0 +; CI-NEXT: v_cvt_f32_f16_e64 v1, |s2| +; CI-NEXT: v_cmp_nle_f32_e64 s[0:1], s3, v1 ; CI-NEXT: s_and_b64 vcc, s[0:1], vcc -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, -1 -; CI-NEXT: v_cndmask_b32_e32 v0, v6, v1, vcc -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cndmask_b32_e32 v1, v0, v6, vcc +; CI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v2 +; CI-NEXT: v_cvt_f32_f16_e64 v2, |s4| +; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; CI-NEXT: v_cmp_nle_f32_e64 s[0:1], s3, v2 +; CI-NEXT: s_and_b64 vcc, s[0:1], vcc +; CI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_or_b32_e32 v1, v2, v0 +; CI-NEXT: v_or_b32_e32 v1, v1, v0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; CI-NEXT: v_or_b32_e32 v0, v7, v0 +; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; CI-NEXT: v_or_b32_e32 v0, v2, v0 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; CI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index d4bddf26d0ed3..4b7e08f814b5b 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -9207,14 +9207,12 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX7-NEXT: v_not_b32_e32 v7, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -9250,14 +9248,12 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX6-NEXT: v_not_b32_e32 v7, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -9726,14 +9722,12 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -9770,14 +9764,12 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10250,14 +10242,12 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -10294,14 +10284,12 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10746,23 +10734,21 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 ; GFX7-NEXT: v_not_b32_e32 v6, v3 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX7-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_add_f32_e32 v3, v3, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX7-NEXT: v_mov_b32_e32 v8, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v3 @@ -10788,24 +10774,22 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 ; GFX6-NEXT: v_not_b32_e32 v6, v3 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX6-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX6-NEXT: v_add_f32_e32 v3, v3, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX6-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX6-NEXT: v_mov_b32_e32 v8, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v3 @@ -11249,14 +11233,12 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -11292,14 +11274,12 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -11757,14 +11737,12 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -11800,14 +11778,12 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -12160,8 +12136,6 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -12195,8 +12169,6 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -12533,10 +12505,8 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -12567,10 +12537,8 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -13041,14 +13009,12 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -13085,14 +13051,12 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -13552,14 +13516,12 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -13595,14 +13557,12 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -19572,49 +19532,39 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB64_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB64_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -19626,51 +19576,40 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB64_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB64_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result @@ -19826,49 +19765,39 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB65_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB65_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -19880,51 +19809,40 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB65_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB65_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -20081,53 +19999,43 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB66_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB66_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -20139,55 +20047,44 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB66_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB66_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -20325,41 +20222,32 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB67_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB67_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -20375,47 +20263,37 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB67_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB67_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -20554,41 +20432,32 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB68_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB68_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -20604,47 +20473,37 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB68_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB68_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -20784,45 +20643,36 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB69_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB69_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -20838,51 +20688,41 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB69_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB69_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -21042,49 +20882,39 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB70_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB70_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -21096,51 +20926,40 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB70_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB70_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -21283,41 +21102,32 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB71_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB71_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -21333,47 +21143,37 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB71_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB71_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -21543,49 +21343,39 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(p ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB72_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB72_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: @@ -21597,51 +21387,40 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(p ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB72_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB72_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret <2 x half> %result @@ -21804,41 +21583,32 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr a ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB73_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB73_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -21854,47 +21624,37 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr a ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB73_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB73_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret void @@ -22049,49 +21809,39 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB74_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB74_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -22103,51 +21853,40 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB74_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB74_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret <2 x half> %result @@ -22284,41 +22023,32 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB75_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB75_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -22334,47 +22064,37 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB75_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB75_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret void @@ -22543,49 +22263,39 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspac ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB76_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote: @@ -22597,51 +22307,40 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspac ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB76_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst ret <2 x half> %result @@ -22804,41 +22503,32 @@ define void @global_agent_atomic_fadd_noret_v2f16__maybe_remote(ptr addrspace(1) ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB77_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB77_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -22854,47 +22544,37 @@ define void @global_agent_atomic_fadd_noret_v2f16__maybe_remote(ptr addrspace(1) ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB77_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB77_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst ret void diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index bcf51f89920c0..041a77c960f04 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -4777,14 +4777,12 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX7-NEXT: v_not_b32_e32 v7, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -4820,14 +4818,12 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX6-NEXT: v_not_b32_e32 v7, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -5233,14 +5229,12 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -5277,14 +5271,12 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -5693,14 +5685,12 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -5737,14 +5727,12 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6125,23 +6113,21 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 ; GFX7-NEXT: v_not_b32_e32 v6, v3 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX7-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX7-NEXT: v_mov_b32_e32 v8, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v3 @@ -6167,24 +6153,22 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 ; GFX6-NEXT: v_not_b32_e32 v6, v3 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX6-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX6-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX6-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX6-NEXT: v_mov_b32_e32 v8, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v3 @@ -6569,14 +6553,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6612,14 +6594,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7016,14 +6996,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7059,14 +7037,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7374,8 +7350,6 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7409,8 +7383,6 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7705,10 +7677,8 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7739,10 +7709,8 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8151,14 +8119,12 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8195,14 +8161,12 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8603,14 +8567,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8646,14 +8608,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -13773,49 +13733,39 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB46_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -13827,51 +13777,40 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB46_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result @@ -14076,49 +14015,39 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB47_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14130,51 +14059,40 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB47_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -14380,53 +14298,43 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB48_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -14438,55 +14346,44 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB48_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -14684,41 +14581,32 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB49_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14734,47 +14622,37 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB49_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -14973,41 +14851,32 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB50_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15023,47 +14892,37 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB50_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -15263,45 +15122,36 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB51_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15317,51 +15167,41 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB51_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -15570,49 +15410,39 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB52_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15624,51 +15454,40 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB52_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -15871,41 +15690,32 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB53_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15921,47 +15731,37 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB53_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll index 9406e08e9e412..e13a16b762d6d 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll @@ -4777,14 +4777,12 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX7-NEXT: v_not_b32_e32 v7, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -4820,14 +4818,12 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX6-NEXT: v_not_b32_e32 v7, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -5233,14 +5229,12 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -5277,14 +5271,12 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -5693,14 +5685,12 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -5737,14 +5727,12 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6125,23 +6113,21 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 ; GFX7-NEXT: v_not_b32_e32 v6, v3 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX7-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX7-NEXT: v_mov_b32_e32 v8, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v3 @@ -6167,24 +6153,22 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 ; GFX6-NEXT: v_not_b32_e32 v6, v3 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX6-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX6-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX6-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX6-NEXT: v_mov_b32_e32 v8, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v3 @@ -6569,14 +6553,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6612,14 +6594,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7016,14 +6996,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7059,14 +7037,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7374,8 +7350,6 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7409,8 +7383,6 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7705,10 +7677,8 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7739,10 +7709,8 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8151,14 +8119,12 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8195,14 +8161,12 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8603,14 +8567,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8646,14 +8608,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -13773,49 +13733,39 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB46_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -13827,51 +13777,40 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB46_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result @@ -14076,49 +14015,39 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB47_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14130,51 +14059,40 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB47_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -14380,53 +14298,43 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB48_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -14438,55 +14346,44 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB48_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -14684,41 +14581,32 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB49_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14734,47 +14622,37 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB49_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -14973,41 +14851,32 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB50_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15023,47 +14892,37 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB50_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -15263,45 +15122,36 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB51_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15317,51 +15167,41 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB51_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -15570,49 +15410,39 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB52_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15624,51 +15454,40 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB52_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -15871,41 +15690,32 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB53_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15921,47 +15731,37 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB53_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll index f4b7280062bb8..0229a482ca17b 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll @@ -5513,14 +5513,12 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX7-NEXT: v_not_b32_e32 v7, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -5556,14 +5554,12 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX6-NEXT: v_not_b32_e32 v7, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -5944,14 +5940,12 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -5988,14 +5982,12 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6379,14 +6371,12 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6423,14 +6413,12 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6791,23 +6779,21 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 ; GFX7-NEXT: v_not_b32_e32 v6, v3 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX7-NEXT: v_sub_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_sub_f32_e32 v3, v3, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX7-NEXT: v_mov_b32_e32 v8, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v3 @@ -6833,24 +6819,22 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 ; GFX6-NEXT: v_not_b32_e32 v6, v3 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX6-NEXT: v_sub_f32_e32 v3, v3, v5 +; GFX6-NEXT: v_sub_f32_e32 v3, v3, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX6-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX6-NEXT: v_mov_b32_e32 v8, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v3 @@ -7210,14 +7194,12 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7253,14 +7235,12 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7632,14 +7612,12 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7675,14 +7653,12 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7970,8 +7946,6 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8005,8 +7979,6 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8281,10 +8253,8 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8315,10 +8285,8 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8702,14 +8670,12 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8746,14 +8712,12 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -9129,14 +9093,12 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -9172,14 +9134,12 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -14262,49 +14222,39 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB42_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_ret_v2f16: @@ -14316,51 +14266,40 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB42_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst ret <2 x half> %result @@ -14548,49 +14487,39 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB43_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_ret_v2f16__offset12b_pos: @@ -14602,51 +14531,40 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB43_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 %result = atomicrmw fsub ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst @@ -14835,53 +14753,43 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB44_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_ret_v2f16__offset12b_neg: @@ -14893,55 +14801,44 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB44_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 %result = atomicrmw fsub ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst @@ -15120,41 +15017,32 @@ define void @global_agent_atomic_fsub_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB45_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15170,47 +15058,37 @@ define void @global_agent_atomic_fsub_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB45_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst ret void @@ -15390,41 +15268,32 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB46_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15440,47 +15309,37 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB46_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 %unused = atomicrmw fsub ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst @@ -15661,45 +15520,36 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB47_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15715,51 +15565,41 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB47_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 %unused = atomicrmw fsub ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst @@ -15951,49 +15791,39 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB48_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fsub_ret_v2f16__offset12b_pos: @@ -16005,51 +15835,40 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB48_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 %result = atomicrmw fsub ptr addrspace(1) %gep, <2 x half> %val seq_cst @@ -16233,41 +16052,32 @@ define void @global_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB49_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16283,47 +16093,37 @@ define void @global_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB49_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 %unused = atomicrmw fsub ptr addrspace(1) %gep, <2 x half> %val seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll index 5154ba95aec78..8048bf6b6e4e5 100644 --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -404,11 +404,11 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s4, s3, 16 -; CI-NEXT: s_lshr_b32 s5, s2, 16 +; CI-NEXT: s_lshr_b32 s4, s2, 16 +; CI-NEXT: s_lshr_b32 s5, s3, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s3 -; CI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; CI-NEXT: v_cvt_f32_f16_e32 v3, s5 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v5, s1 ; CI-NEXT: v_mov_b32_e32 v4, s0 @@ -460,12 +460,11 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s6, s1, 16 -; CI-NEXT: s_lshr_b32 s7, s0, 16 -; CI-NEXT: s_lshr_b32 s8, s3, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v3, s6 ; CI-NEXT: s_lshr_b32 s6, s2, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v7, s8 +; CI-NEXT: s_lshr_b32 s7, s3, 16 +; CI-NEXT: s_lshr_b32 s8, s0, 16 +; CI-NEXT: s_lshr_b32 s9, s1, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v7, s7 ; CI-NEXT: v_cvt_f32_f16_e32 v5, s6 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; CI-NEXT: v_cvt_f32_f16_e32 v6, s3 @@ -473,7 +472,8 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8 ; CI-NEXT: s_add_u32 s0, s4, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s1 ; CI-NEXT: s_addc_u32 s1, s5, 0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s7 +; CI-NEXT: v_cvt_f32_f16_e32 v3, s9 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s8 ; CI-NEXT: v_mov_b32_e32 v9, s1 ; CI-NEXT: v_mov_b32_e32 v8, s0 ; CI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] @@ -652,53 +652,29 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2 } define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 x half> %arg) #0 { -; CI-LABEL: extload_v3f16_to_v3f64_arg: -; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; CI-NEXT: s_add_i32 s12, s12, s17 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 -; CI-NEXT: s_lshr_b32 s4, s2, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s2 -; CI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; CI-NEXT: s_add_u32 s2, s0, 16 -; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v0 -; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 -; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; CI-NEXT: v_mov_b32_e32 v7, s3 -; CI-NEXT: v_mov_b32_e32 v6, s2 -; CI-NEXT: flat_store_dwordx2 v[6:7], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s1 -; CI-NEXT: v_mov_b32_e32 v4, s0 -; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; CI-NEXT: s_endpgm -; -; VI-LABEL: extload_v3f16_to_v3f64_arg: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; VI-NEXT: s_add_i32 s12, s12, s17 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 -; VI-NEXT: s_lshr_b32 s4, s2, 16 -; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; VI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; VI-NEXT: s_add_u32 s2, s0, 16 -; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v1 -; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; VI-NEXT: v_mov_b32_e32 v7, s3 -; VI-NEXT: v_mov_b32_e32 v6, s2 -; VI-NEXT: flat_store_dwordx2 v[6:7], v[4:5] -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: s_endpgm +; CIVI-LABEL: extload_v3f16_to_v3f64_arg: +; CIVI: ; %bb.0: +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: v_cvt_f32_f16_e32 v1, s3 +; CIVI-NEXT: s_lshr_b32 s4, s2, 16 +; CIVI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; CIVI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; CIVI-NEXT: s_add_u32 s2, s0, 16 +; CIVI-NEXT: v_cvt_f64_f32_e32 v[4:5], v1 +; CIVI-NEXT: s_addc_u32 s3, s1, 0 +; CIVI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; CIVI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; CIVI-NEXT: v_mov_b32_e32 v7, s3 +; CIVI-NEXT: v_mov_b32_e32 v6, s2 +; CIVI-NEXT: flat_store_dwordx2 v[6:7], v[4:5] +; CIVI-NEXT: v_mov_b32_e32 v5, s1 +; CIVI-NEXT: v_mov_b32_e32 v4, s0 +; CIVI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; CIVI-NEXT: s_endpgm ; ; GFX11-LABEL: extload_v3f16_to_v3f64_arg: ; GFX11: ; %bb.0: @@ -815,37 +791,37 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_lshr_b32 s9, s0, 16 ; CI-NEXT: s_lshr_b32 s6, s3, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v0, s6 -; CI-NEXT: v_cvt_f32_f16_e32 v12, s3 ; CI-NEXT: s_lshr_b32 s7, s2, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s9 +; CI-NEXT: v_cvt_f32_f16_e32 v4, s7 +; CI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; CI-NEXT: v_cvt_f32_f16_e32 v12, s3 ; CI-NEXT: s_lshr_b32 s8, s1, 16 -; CI-NEXT: s_lshr_b32 s6, s0, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s7 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; CI-NEXT: v_cvt_f32_f16_e32 v8, s2 -; CI-NEXT: v_cvt_f32_f16_e32 v9, s0 ; CI-NEXT: s_add_u32 s0, s4, 48 -; CI-NEXT: v_cvt_f32_f16_e32 v5, s1 -; CI-NEXT: v_cvt_f64_f32_e32 v[14:15], v0 +; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v4 +; CI-NEXT: v_cvt_f64_f32_e32 v[14:15], v5 +; CI-NEXT: v_cvt_f32_f16_e32 v4, s1 ; CI-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 ; CI-NEXT: s_addc_u32 s1, s5, 0 -; CI-NEXT: v_cvt_f32_f16_e32 v4, s8 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s8 ; CI-NEXT: v_mov_b32_e32 v17, s1 ; CI-NEXT: v_mov_b32_e32 v16, s0 ; CI-NEXT: s_add_u32 s0, s4, 32 -; CI-NEXT: v_cvt_f32_f16_e32 v2, s6 -; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v1 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 ; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 ; CI-NEXT: s_addc_u32 s1, s5, 0 ; CI-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 +; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v1 ; CI-NEXT: v_mov_b32_e32 v13, s1 -; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 +; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 ; CI-NEXT: v_mov_b32_e32 v12, s0 ; CI-NEXT: s_add_u32 s0, s4, 16 ; CI-NEXT: s_addc_u32 s1, s5, 0 -; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; CI-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; CI-NEXT: s_nop 0 ; CI-NEXT: v_mov_b32_e32 v9, s1 @@ -1134,12 +1110,12 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: flat_load_dword v1, v[0:1] +; CI-NEXT: flat_load_dword v0, v[0:1] ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -1205,14 +1181,14 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: flat_load_dwordx2 v[1:2], v[0:1] -; CI-NEXT: v_mov_b32_e32 v3, s0 +; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: v_mov_b32_e32 v4, s1 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; CI-NEXT: v_mov_b32_e32 v3, s0 ; CI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; CI-NEXT: s_endpgm ; @@ -1280,14 +1256,14 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: flat_load_dwordx2 v[3:4], v[0:1] +; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: v_mov_b32_e32 v5, s1 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v4 ; CI-NEXT: v_mov_b32_e32 v4, s0 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -1368,18 +1344,18 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out, ; CI-NEXT: v_mov_b32_e32 v13, s1 ; CI-NEXT: v_mov_b32_e32 v12, s0 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; CI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v10, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v2 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v5 ; CI-NEXT: v_cvt_f32_f16_e32 v6, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v14 ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: flat_store_dwordx4 v[0:1], v[8:11] @@ -1473,61 +1449,61 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_add_i32 s12, s12, s17 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s4, s2, 16 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; CI-NEXT: s_add_u32 s2, s2, 16 +; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v5, s3 -; CI-NEXT: s_addc_u32 s5, s3, 0 -; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v4, s2 -; CI-NEXT: v_mov_b32_e32 v1, s5 -; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; CI-NEXT: s_add_u32 s2, s0, 16 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v14, s3 -; CI-NEXT: v_mov_b32_e32 v13, s2 +; CI-NEXT: v_mov_b32_e32 v16, s3 +; CI-NEXT: v_mov_b32_e32 v15, s2 ; CI-NEXT: s_add_u32 s2, s0, 48 ; CI-NEXT: s_addc_u32 s3, s1, 0 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f32_f16_e32 v8, v1 +; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v8 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v2 +; CI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v11, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v6 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v6 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 -; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v4 -; CI-NEXT: flat_store_dwordx4 v[13:14], v[9:12] -; CI-NEXT: v_cvt_f32_f16_e32 v6, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v3 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v2 -; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; CI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; CI-NEXT: flat_store_dwordx4 v[15:16], v[8:11] ; CI-NEXT: v_cvt_f32_f16_e32 v2, v5 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 ; CI-NEXT: v_mov_b32_e32 v5, s1 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v13, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v16 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v17 +; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; CI-NEXT: v_mov_b32_e32 v4, s0 ; CI-NEXT: s_add_u32 s0, s0, 32 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v18 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v1 ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_mov_b32_e32 v15, s3 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; CI-NEXT: v_mov_b32_e32 v11, s3 ; CI-NEXT: v_mov_b32_e32 v17, s1 -; CI-NEXT: v_mov_b32_e32 v14, s2 +; CI-NEXT: v_mov_b32_e32 v10, s2 ; CI-NEXT: v_mov_b32_e32 v16, s0 -; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; CI-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; CI-NEXT: flat_store_dwordx4 v[16:17], v[6:9] +; CI-NEXT: flat_store_dwordx4 v[4:5], v[12:15] +; CI-NEXT: flat_store_dwordx4 v[10:11], v[6:9] +; CI-NEXT: flat_store_dwordx4 v[16:17], v[0:3] ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_extload_v16f16_to_v16f32: @@ -1917,21 +1893,21 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: s_add_u32 s2, s0, 16 ; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: v_mov_b32_e32 v11, s3 ; CI-NEXT: v_mov_b32_e32 v9, s1 +; CI-NEXT: v_mov_b32_e32 v10, s2 ; CI-NEXT: v_mov_b32_e32 v8, s0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v0 -; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v3 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 ; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v2 -; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v10 -; CI-NEXT: v_mov_b32_e32 v11, s3 -; CI-NEXT: v_mov_b32_e32 v10, s2 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 ; CI-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; CI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; CI-NEXT: s_endpgm @@ -2045,26 +2021,26 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ; CI-NEXT: v_mov_b32_e32 v14, s2 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v4 ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 -; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v1 -; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v16, v5 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v3 -; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v2 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; CI-NEXT: v_cvt_f32_f16_e32 v19, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v18, v8 ; CI-NEXT: v_cvt_f32_f16_e32 v17, v9 -; CI-NEXT: v_cvt_f32_f16_e32 v18, v11 -; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v10 ; CI-NEXT: flat_store_dwordx4 v[6:7], v[0:3] -; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v10 -; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v16 -; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v17 -; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v18 +; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v16 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v11 +; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v19 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v18 +; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v17 ; CI-NEXT: v_mov_b32_e32 v17, s1 ; CI-NEXT: v_mov_b32_e32 v16, s0 ; CI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] @@ -2202,91 +2178,92 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; CI-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; CI-NEXT: s_add_u32 s2, s2, 16 ; CI-NEXT: s_addc_u32 s3, s3, 0 -; CI-NEXT: v_mov_b32_e32 v5, s3 -; CI-NEXT: v_mov_b32_e32 v4, s2 -; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CI-NEXT: s_add_u32 s2, s0, 48 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v15, s3 -; CI-NEXT: v_mov_b32_e32 v14, s2 +; CI-NEXT: v_mov_b32_e32 v14, s3 +; CI-NEXT: v_mov_b32_e32 v13, s2 ; CI-NEXT: s_add_u32 s2, s0, 32 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v17, s3 -; CI-NEXT: v_mov_b32_e32 v16, s2 +; CI-NEXT: v_mov_b32_e32 v16, s3 +; CI-NEXT: v_mov_b32_e32 v15, s2 ; CI-NEXT: s_add_u32 s2, s0, 16 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v19, s3 -; CI-NEXT: v_mov_b32_e32 v18, s2 +; CI-NEXT: v_mov_b32_e32 v18, s3 +; CI-NEXT: v_mov_b32_e32 v17, s2 ; CI-NEXT: s_add_u32 s2, s0, 0x70 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v13, s1 -; CI-NEXT: v_mov_b32_e32 v12, s0 +; CI-NEXT: v_mov_b32_e32 v12, s1 +; CI-NEXT: v_mov_b32_e32 v11, s0 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v8 +; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v8 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v20, 16, v5 -; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v3 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v21, v5 -; CI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] -; CI-NEXT: v_mov_b32_e32 v15, s3 -; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 -; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v3 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; CI-NEXT: v_mov_b32_e32 v14, s2 -; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 -; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; CI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 +; CI-NEXT: v_cvt_f64_f32_e32 v[7:8], v7 +; CI-NEXT: v_cvt_f64_f32_e32 v[9:10], v9 +; CI-NEXT: v_cvt_f32_f16_e32 v20, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v21, v3 +; CI-NEXT: flat_store_dwordx4 v[13:14], v[7:10] +; CI-NEXT: s_nop 0 +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v7 +; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; CI-NEXT: v_mov_b32_e32 v14, s3 +; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; CI-NEXT: v_mov_b32_e32 v13, s2 +; CI-NEXT: s_add_u32 s2, s0, 0x60 +; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: flat_store_dwordx4 v[15:16], v[6:9] +; CI-NEXT: v_mov_b32_e32 v16, s3 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v4 +; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 +; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: flat_store_dwordx4 v[18:19], v[0:3] -; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 +; CI-NEXT: v_mov_b32_e32 v15, s2 +; CI-NEXT: s_add_u32 s2, s0, 0x50 +; CI-NEXT: flat_store_dwordx4 v[17:18], v[4:7] ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v1 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v10 -; CI-NEXT: s_add_u32 s2, s0, 0x60 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v11 -; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; CI-NEXT: flat_store_dwordx4 v[12:13], v[0:3] -; CI-NEXT: v_mov_b32_e32 v17, s3 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v19 +; CI-NEXT: v_cvt_f32_f16_e32 v22, v5 +; CI-NEXT: flat_store_dwordx4 v[11:12], v[0:3] +; CI-NEXT: v_cvt_f32_f16_e32 v12, v4 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v21 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v20 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v5 -; CI-NEXT: v_mov_b32_e32 v16, s2 -; CI-NEXT: s_add_u32 s2, s0, 0x50 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v6 +; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v20 ; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 ; CI-NEXT: s_add_u32 s0, s0, 64 -; CI-NEXT: flat_store_dwordx4 v[14:15], v[0:3] +; CI-NEXT: flat_store_dwordx4 v[13:14], v[0:3] ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v21 -; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v7 -; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 -; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v12 -; CI-NEXT: v_mov_b32_e32 v19, s3 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v12 +; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 +; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v22 +; CI-NEXT: v_mov_b32_e32 v18, s3 ; CI-NEXT: v_mov_b32_e32 v13, s1 -; CI-NEXT: v_mov_b32_e32 v18, s2 +; CI-NEXT: v_mov_b32_e32 v17, s2 ; CI-NEXT: v_mov_b32_e32 v12, s0 -; CI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; CI-NEXT: flat_store_dwordx4 v[18:19], v[0:3] +; CI-NEXT: flat_store_dwordx4 v[15:16], v[8:11] +; CI-NEXT: flat_store_dwordx4 v[17:18], v[0:3] ; CI-NEXT: flat_store_dwordx4 v[12:13], v[4:7] ; CI-NEXT: s_endpgm ; @@ -3146,11 +3123,11 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; CI-NEXT: s_lshr_b32 s0, s0, 16 +; CI-NEXT: s_lshr_b32 s1, s0, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s1 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CI-NEXT: v_add_f32_e32 v0, v0, v1 +; CI-NEXT: v_add_f32_e32 v0, v1, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -3213,14 +3190,14 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s2, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; CI-NEXT: s_lshr_b32 s2, s3, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 -; CI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; CI-NEXT: v_cvt_f32_f16_e32 v3, s2 -; CI-NEXT: v_add_f32_e32 v0, v0, v1 +; CI-NEXT: s_lshr_b32 s5, s3, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s2 +; CI-NEXT: v_cvt_f32_f16_e32 v2, s5 +; CI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; CI-NEXT: v_add_f32_e32 v0, v1, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_add_f32_e32 v1, v2, v3 +; CI-NEXT: v_add_f32_e32 v1, v3, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v2, v0, v1 @@ -3276,30 +3253,30 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_mov_b32_e32 v4, s0 ; CI-NEXT: v_mov_b32_e32 v5, s1 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v6, v0 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v1 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v2 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v3 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_add_f32_e32 v7, v7, v9 -; CI-NEXT: v_add_f32_e32 v6, v6, v8 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; CI-NEXT: v_add_f32_e32 v1, v1, v3 ; CI-NEXT: v_add_f32_e32 v0, v0, v2 +; CI-NEXT: v_add_f32_e32 v2, v7, v9 +; CI-NEXT: v_add_f32_e32 v3, v6, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v6 -; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_or_b32_e32 v1, v2, v1 -; CI-NEXT: v_or_b32_e32 v0, v3, v0 +; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_or_b32_e32 v1, v1, v2 +; CI-NEXT: v_or_b32_e32 v0, v0, v3 ; CI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; CI-NEXT: s_endpgm ; @@ -3350,58 +3327,58 @@ define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x ; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x4 ; CI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 ; CI-NEXT: s_add_i32 s12, s12, s17 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s10, s0, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v4, s0 -; CI-NEXT: s_lshr_b32 s0, s4, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v8, s0 -; CI-NEXT: s_lshr_b32 s0, s5, 16 -; CI-NEXT: s_lshr_b32 s11, s1, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v9, s0 -; CI-NEXT: s_lshr_b32 s0, s6, 16 +; CI-NEXT: s_lshr_b32 s13, s3, 16 +; CI-NEXT: s_lshr_b32 s14, s7, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s14 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s13 ; CI-NEXT: s_lshr_b32 s12, s2, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v0, s10 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s11 -; CI-NEXT: s_lshr_b32 s10, s3, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v10, s0 -; CI-NEXT: s_lshr_b32 s0, s7, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v2, s12 -; CI-NEXT: v_cvt_f32_f16_e32 v3, s10 -; CI-NEXT: v_cvt_f32_f16_e32 v5, s1 -; CI-NEXT: v_cvt_f32_f16_e32 v11, s0 -; CI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; CI-NEXT: v_cvt_f32_f16_e32 v13, s5 -; CI-NEXT: v_cvt_f32_f16_e32 v6, s2 -; CI-NEXT: v_cvt_f32_f16_e32 v7, s3 -; CI-NEXT: v_cvt_f32_f16_e32 v14, s7 -; CI-NEXT: v_cvt_f32_f16_e32 v15, s6 -; CI-NEXT: v_add_f32_e32 v1, v1, v9 -; CI-NEXT: v_add_f32_e32 v0, v0, v8 -; CI-NEXT: v_add_f32_e32 v3, v3, v11 -; CI-NEXT: v_add_f32_e32 v2, v2, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_add_f32_e32 v5, v5, v13 +; CI-NEXT: s_lshr_b32 s15, s6, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v2, s3 +; CI-NEXT: v_add_f32_e32 v0, v1, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s7 +; CI-NEXT: v_cvt_f32_f16_e32 v3, s15 +; CI-NEXT: v_cvt_f32_f16_e32 v4, s12 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_add_f32_e32 v4, v4, v12 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_add_f32_e32 v7, v7, v14 +; CI-NEXT: v_add_f32_e32 v1, v2, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_add_f32_e32 v2, v4, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v4, s6 +; CI-NEXT: v_cvt_f32_f16_e32 v5, s2 +; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; CI-NEXT: s_lshr_b32 s11, s1, 16 +; CI-NEXT: s_lshr_b32 s14, s5, 16 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_add_f32_e32 v6, v6, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_or_b32_e32 v3, v1, v0 +; CI-NEXT: v_add_f32_e32 v1, v5, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v4, s14 +; CI-NEXT: v_cvt_f32_f16_e32 v5, s11 +; CI-NEXT: v_cvt_f32_f16_e32 v6, s5 +; CI-NEXT: v_cvt_f32_f16_e32 v7, s1 +; CI-NEXT: s_lshr_b32 s10, s0, 16 +; CI-NEXT: s_lshr_b32 s13, s4, 16 +; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; CI-NEXT: v_or_b32_e32 v2, v1, v0 +; CI-NEXT: v_add_f32_e32 v0, v5, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v4, s13 +; CI-NEXT: v_cvt_f32_f16_e32 v5, s10 +; CI-NEXT: v_add_f32_e32 v1, v7, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; CI-NEXT: v_cvt_f32_f16_e32 v7, s0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_add_f32_e32 v4, v5, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_add_f32_e32 v5, v7, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_or_b32_e32 v1, v5, v1 -; CI-NEXT: v_or_b32_e32 v0, v4, v0 +; CI-NEXT: v_or_b32_e32 v1, v1, v0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; CI-NEXT: v_or_b32_e32 v0, v5, v0 ; CI-NEXT: v_mov_b32_e32 v4, s8 -; CI-NEXT: v_or_b32_e32 v3, v7, v3 -; CI-NEXT: v_or_b32_e32 v2, v6, v2 ; CI-NEXT: v_mov_b32_e32 v5, s9 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; CI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index 92ea83fdfb982..c24ee53de2e76 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -2480,10 +2480,10 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) % ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: s_and_b32 s1, s4, 0xffff -; CI-NEXT: s_lshl_b32 s2, s4, 16 +; CI-NEXT: s_lshl_b32 s1, s4, 16 +; CI-NEXT: s_and_b32 s2, s4, 0xffff ; CI-NEXT: s_lshl_b32 s3, s5, 4 -; CI-NEXT: s_or_b32 s2, s1, s2 +; CI-NEXT: s_or_b32 s2, s2, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 ; CI-NEXT: s_lshl_b64 s[0:1], 0xffff, s3 ; CI-NEXT: v_mov_b32_e32 v4, s2 @@ -2839,60 +2839,48 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CI-NEXT: v_mov_b32_e32 v5, s1 ; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; CI-NEXT: s_cmp_eq_u32 s5, 7 +; CI-NEXT: s_cmp_eq_u32 s5, 6 ; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; CI-NEXT: v_mov_b32_e32 v6, s4 ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 6 -; CI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 5 -; CI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 4 +; CI-NEXT: s_cmp_eq_u32 s5, 7 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; CI-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; CI-NEXT: s_cselect_b64 vcc, -1, 0 +; CI-NEXT: s_cmp_eq_u32 s5, 4 +; CI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; CI-NEXT: s_cselect_b64 vcc, -1, 0 +; CI-NEXT: s_cmp_eq_u32 s5, 5 ; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; CI-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] -; CI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 3 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc +; CI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 2 -; CI-NEXT: v_cndmask_b32_e32 v9, v9, v6, vcc +; CI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 1 +; CI-NEXT: s_cmp_eq_u32 s5, 3 +; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; CI-NEXT: v_or_b32_e32 v3, v7, v3 +; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 ; CI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 0 -; CI-NEXT: v_cndmask_b32_e64 v8, v8, v6, s[2:3] -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cndmask_b32_e32 v10, v10, v6, vcc +; CI-NEXT: v_or_b32_e32 v2, v2, v7 +; CI-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: s_cmp_eq_u32 s5, 1 +; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; CI-NEXT: v_or_b32_e32 v3, v3, v6 -; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v10 -; CI-NEXT: v_or_b32_e32 v2, v2, v7 -; CI-NEXT: v_or_b32_e32 v1, v1, v8 +; CI-NEXT: s_cselect_b64 vcc, -1, 0 +; CI-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc +; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; CI-NEXT: v_or_b32_e32 v1, v1, v7 ; CI-NEXT: v_or_b32_e32 v0, v0, v6 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; CI-NEXT: s_endpgm @@ -3425,7 +3413,7 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; CI-LABEL: v_insertelement_v16f16_dynamic: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 +; CI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 5, v0 ; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 @@ -3440,119 +3428,94 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CI-NEXT: v_mov_b32_e32 v5, s1 ; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; CI-NEXT: s_cmp_eq_u32 s5, 15 ; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; CI-NEXT: v_add_i32_e32 v11, vcc, 16, v4 +; CI-NEXT: s_cmp_eq_u32 s7, 14 +; CI-NEXT: v_addc_u32_e32 v12, vcc, 0, v5, vcc +; CI-NEXT: v_mov_b32_e32 v6, s6 ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 14 -; CI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 13 -; CI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 12 +; CI-NEXT: s_cmp_eq_u32 s7, 15 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v9 -; CI-NEXT: v_lshrrev_b32_e32 v13, 16, v8 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_cndmask_b32_e64 v10, v10, v6, s[0:1] -; CI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 11 -; CI-NEXT: v_cndmask_b32_e32 v11, v11, v6, vcc -; CI-NEXT: v_cndmask_b32_e64 v12, v12, v6, s[2:3] +; CI-NEXT: v_cndmask_b32_e32 v13, v10, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 10 +; CI-NEXT: s_cmp_eq_u32 s7, 12 +; CI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CI-NEXT: s_cmp_eq_u32 s7, 13 +; CI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 ; CI-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[0:1] -; CI-NEXT: v_cndmask_b32_e32 v13, v13, v6, vcc -; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; CI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc -; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; CI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; CI-NEXT: v_or_b32_e32 v9, v9, v12 -; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; CI-NEXT: v_or_b32_e32 v8, v8, v12 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v14 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v13, v15 -; CI-NEXT: s_cmp_eq_u32 s5, 9 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 -; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 8 -; CI-NEXT: v_cvt_f32_f16_e32 v14, v16 -; CI-NEXT: v_cndmask_b32_e32 v12, v12, v6, vcc -; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 7 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc +; CI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CI-NEXT: s_cmp_eq_u32 s7, 10 +; CI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CI-NEXT: s_cmp_eq_u32 s7, 11 +; CI-NEXT: v_lshrrev_b32_e32 v15, 16, v8 +; CI-NEXT: v_cndmask_b32_e64 v8, v8, v6, s[2:3] +; CI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CI-NEXT: s_cmp_eq_u32 s7, 8 +; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; CI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; CI-NEXT: v_cndmask_b32_e64 v15, v15, v6, s[2:3] +; CI-NEXT: s_cmp_eq_u32 s7, 9 +; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v7 +; CI-NEXT: v_cndmask_b32_e32 v10, v10, v6, vcc +; CI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 6 -; CI-NEXT: v_cndmask_b32_e32 v13, v13, v6, vcc +; CI-NEXT: s_cmp_eq_u32 s7, 6 +; CI-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[4:5] +; CI-NEXT: v_or_b32_e32 v8, v8, v15 +; CI-NEXT: v_cndmask_b32_e32 v15, v16, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 5 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; CI-NEXT: s_cmp_eq_u32 s7, 7 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; CI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; CI-NEXT: v_cndmask_b32_e64 v14, v14, v6, s[0:1] +; CI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; CI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 4 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; CI-NEXT: v_cndmask_b32_e32 v14, v14, v6, vcc +; CI-NEXT: s_cmp_eq_u32 s7, 4 +; CI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; CI-NEXT: v_or_b32_e32 v10, v13, v10 +; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; CI-NEXT: v_or_b32_e32 v7, v7, v15 +; CI-NEXT: v_cndmask_b32_e32 v15, v17, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: s_cmp_eq_u32 s7, 5 +; CI-NEXT: v_or_b32_e32 v9, v9, v13 +; CI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 ; CI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; CI-NEXT: v_or_b32_e32 v10, v10, v11 -; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; CI-NEXT: v_or_b32_e32 v7, v7, v12 -; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: v_or_b32_e32 v3, v3, v12 -; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v14 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_or_b32_e32 v2, v2, v12 -; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; CI-NEXT: s_cmp_eq_u32 s5, 3 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 2 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cndmask_b32_e32 v11, v11, v6, vcc +; CI-NEXT: s_cmp_eq_u32 s7, 2 +; CI-NEXT: v_cndmask_b32_e32 v13, v13, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 1 +; CI-NEXT: s_cmp_eq_u32 s7, 3 +; CI-NEXT: v_lshrrev_b32_e32 v14, 16, v1 ; CI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 0 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: v_cndmask_b32_e32 v12, v12, v6, vcc +; CI-NEXT: s_cmp_eq_u32 s7, 0 +; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; CI-NEXT: v_cndmask_b32_e32 v14, v14, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; CI-NEXT: s_cmp_eq_u32 s7, 1 +; CI-NEXT: v_or_b32_e32 v2, v2, v13 +; CI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; CI-NEXT: v_or_b32_e32 v1, v1, v6 -; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 +; CI-NEXT: s_cselect_b64 vcc, -1, 0 +; CI-NEXT: v_cndmask_b32_e32 v6, v13, v6, vcc +; CI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; CI-NEXT: v_or_b32_e32 v3, v3, v15 +; CI-NEXT: v_or_b32_e32 v1, v1, v14 ; CI-NEXT: v_or_b32_e32 v0, v0, v6 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; CI-NEXT: s_nop 0 -; CI-NEXT: v_add_i32_e32 v0, vcc, 16, v4 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc -; CI-NEXT: flat_store_dwordx4 v[0:1], v[7:10] +; CI-NEXT: flat_store_dwordx4 v[11:12], v[7:10] ; CI-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: v_insertelement_v16f16_dynamic: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll index 769bf0a6458b2..4e6b0018f661b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll @@ -143,19 +143,19 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX6-NEXT: s_mov_b32 s4, s0 ; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x3e22f983, v1 -; GFX6-NEXT: v_fract_f32_e32 v1, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x3e22f983, v0 +; GFX6-NEXT: v_fract_f32_e32 v1, v1 +; GFX6-NEXT: v_cos_f32_e32 v1, v1 ; GFX6-NEXT: v_fract_f32_e32 v0, v0 ; GFX6-NEXT: v_cos_f32_e32 v0, v0 -; GFX6-NEXT: v_cos_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll index ee01c9d0acdc7..1485e3f88f942 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll @@ -5405,26 +5405,28 @@ define float @v_exp_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) { ; SI-SDAG-LABEL: v_exp_f32_from_fpext_math_f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3fb8aa3b +; SI-SDAG-NEXT: s_mov_b32 s5, 0x32a5705f ; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 ; SI-SDAG-NEXT: v_fma_f32 v2, v0, s4, -v1 -; SI-SDAG-NEXT: s_mov_b32 s4, 0x32a5705f ; SI-SDAG-NEXT: v_rndne_f32_e32 v3, v1 -; SI-SDAG-NEXT: v_fma_f32 v2, v0, s4, v2 +; SI-SDAG-NEXT: v_fma_f32 v2, v0, s5, v2 ; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 ; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 +; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3 ; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2ce8ed0 +; SI-SDAG-NEXT: s_mov_b32 s5, 0x42b17218 ; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: s_mov_b32 s4, 0x42b17218 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, v1, v2 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, v1, v3 ; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5575,17 +5577,13 @@ define float @v_exp_f32_from_fpext_math_f16_fast(i16 %src0.i, i16 %src1.i) { ; SI-SDAG-LABEL: v_exp_f32_from_fpext_math_f16_fast: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2aeac50 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 -; SI-SDAG-NEXT: v_add_f32_e32 v1, 0x42800000, v0 -; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x114b4ea4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp_f32_from_fpext_math_f16_fast: @@ -5727,26 +5725,28 @@ define float @v_exp_f32_from_fpext_math_f16_daz(i16 %src0.i, i16 %src1.i) #0 { ; SI-SDAG-LABEL: v_exp_f32_from_fpext_math_f16_daz: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3fb8aa3b +; SI-SDAG-NEXT: s_mov_b32 s5, 0x32a5705f ; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 ; SI-SDAG-NEXT: v_fma_f32 v2, v0, s4, -v1 -; SI-SDAG-NEXT: s_mov_b32 s4, 0x32a5705f ; SI-SDAG-NEXT: v_rndne_f32_e32 v3, v1 -; SI-SDAG-NEXT: v_fma_f32 v2, v0, s4, v2 +; SI-SDAG-NEXT: v_fma_f32 v2, v0, s5, v2 ; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 ; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 +; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3 ; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2ce8ed0 +; SI-SDAG-NEXT: s_mov_b32 s5, 0x42b17218 ; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: s_mov_b32 s4, 0x42b17218 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, v1, v2 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, v1, v3 ; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5880,23 +5880,14 @@ define half @v_exp_fneg_fabs_f16(half %in) { ; GCN-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; SI-SDAG-LABEL: v_exp_fneg_fabs_f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0xbfb8aa3b, v0 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_exp_fneg_fabs_f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e64 v0, -|v0| -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_exp_fneg_fabs_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e64 v0, -|v0| +; SI-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-NEXT: v_exp_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp_fneg_fabs_f16: ; R600: ; %bb.0: @@ -5932,23 +5923,14 @@ define half @v_exp_fneg_f16(half %in) { ; GCN-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; SI-SDAG-LABEL: v_exp_fneg_f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0xbfb8aa3b, v0 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_exp_fneg_f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_exp_fneg_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-NEXT: v_exp_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp_fneg_f16: ; R600: ; %bb.0: @@ -6552,8 +6534,8 @@ define <3 x half> @v_exp_v3f16(<3 x half> %in) { ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 @@ -6658,10 +6640,16 @@ define <3 x half> @v_exp_v3f16_afn(<3 x half> %in) { ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll index 7d830a9306293..c4204417362a4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll @@ -5467,26 +5467,28 @@ define float @v_exp10_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) { ; SI-SDAG-LABEL: v_exp10_f32_from_fpext_math_f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x40549a78 +; SI-SDAG-NEXT: s_mov_b32 s5, 0x33979a37 ; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0 ; SI-SDAG-NEXT: v_fma_f32 v2, v0, s4, -v1 -; SI-SDAG-NEXT: s_mov_b32 s4, 0x33979a37 ; SI-SDAG-NEXT: v_rndne_f32_e32 v3, v1 -; SI-SDAG-NEXT: v_fma_f32 v2, v0, s4, v2 +; SI-SDAG-NEXT: v_fma_f32 v2, v0, s5, v2 ; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 ; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 +; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3 ; SI-SDAG-NEXT: s_mov_b32 s4, 0xc23369f4 +; SI-SDAG-NEXT: s_mov_b32 s5, 0x421a209b ; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: s_mov_b32 s4, 0x421a209b -; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, v1, v2 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, v1, v3 ; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5640,20 +5642,16 @@ define float @v_exp10_f32_from_fpext_math_f16_fast(i16 %src0.i, i16 %src1.i) { ; SI-SDAG-LABEL: v_exp10_f32_from_fpext_math_f16_fast: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: s_mov_b32 s4, 0xc217b818 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 -; SI-SDAG-NEXT: v_add_f32_e32 v1, 0x42000000, v0 -; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3a2784bc, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549000, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0xa4fb11f, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp10_f32_from_fpext_math_f16_fast: @@ -5798,26 +5796,28 @@ define float @v_exp10_f32_from_fpext_math_f16_daz(i16 %src0.i, i16 %src1.i) #0 { ; SI-SDAG-LABEL: v_exp10_f32_from_fpext_math_f16_daz: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x40549a78 +; SI-SDAG-NEXT: s_mov_b32 s5, 0x33979a37 ; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0 ; SI-SDAG-NEXT: v_fma_f32 v2, v0, s4, -v1 -; SI-SDAG-NEXT: s_mov_b32 s4, 0x33979a37 ; SI-SDAG-NEXT: v_rndne_f32_e32 v3, v1 -; SI-SDAG-NEXT: v_fma_f32 v2, v0, s4, v2 +; SI-SDAG-NEXT: v_fma_f32 v2, v0, s5, v2 ; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 ; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 +; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3 ; SI-SDAG-NEXT: s_mov_b32 s4, 0xc23369f4 +; SI-SDAG-NEXT: s_mov_b32 s5, 0x421a209b ; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: s_mov_b32 s4, 0x421a209b -; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, v1, v2 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, v1, v3 ; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5951,23 +5951,14 @@ define half @v_exp10_fneg_fabs_f16(half %in) { ; GCN-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; SI-SDAG-LABEL: v_exp10_fneg_fabs_f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0xc0549a78, v0 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_exp10_fneg_fabs_f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e64 v0, -|v0| -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_exp10_fneg_fabs_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e64 v0, -|v0| +; SI-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0 +; SI-NEXT: v_exp_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp10_fneg_fabs_f16: ; R600: ; %bb.0: @@ -6003,23 +5994,14 @@ define half @v_exp10_fneg_f16(half %in) { ; GCN-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; SI-SDAG-LABEL: v_exp10_fneg_f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0xc0549a78, v0 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_exp10_fneg_f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_exp10_fneg_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0 +; SI-NEXT: v_exp_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp10_fneg_f16: ; R600: ; %bb.0: @@ -6691,8 +6673,8 @@ define <3 x half> @v_exp10_v3f16(<3 x half> %in) { ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x40549a78, v2 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x40549a78, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 @@ -6839,11 +6821,23 @@ define <3 x half> @v_exp10_v3f16_afn(<3 x half> %in) { ; SI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3a278000, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x40548000, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3a278000, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x40548000, v0 -; SI-SDAG-NEXT: v_exp_f32_e32 v5, v5 -; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3a278000, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x40548000, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_exp_f32_e32 v5, v5 +; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_exp_f32_e32 v4, v4 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v3, v3 @@ -6861,8 +6855,8 @@ define <3 x half> @v_exp10_v3f16_afn(<3 x half> %in) { ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, v2, v5 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v4 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v4 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, v1, v3 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll index 97ecb5362a4bc..21c7f56aa0816 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll @@ -2759,18 +2759,12 @@ define float @v_exp2_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) { ; SI-SDAG-LABEL: v_exp2_f32_from_fpext_math_f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 -; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 -; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_not_b32_e32 v1, 63 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp2_f32_from_fpext_math_f16: @@ -2976,22 +2970,13 @@ define half @v_exp2_fabs_f16(half %in) { } define half @v_exp2_fneg_fabs_f16(half %in) { -; SI-SDAG-LABEL: v_exp2_fneg_fabs_f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -|v0| -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_exp2_fneg_fabs_f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e64 v0, -|v0| -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_exp2_fneg_fabs_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e64 v0, -|v0| +; SI-NEXT: v_exp_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_exp2_fneg_fabs_f16: ; VI: ; %bb.0: @@ -3026,22 +3011,13 @@ define half @v_exp2_fneg_fabs_f16(half %in) { } define half @v_exp2_fneg_f16(half %in) { -; SI-SDAG-LABEL: v_exp2_fneg_f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_exp2_fneg_f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_exp2_fneg_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: v_exp_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_exp2_fneg_f16: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll index 97ea988581ce3..e8bf198f89855 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll @@ -16,29 +16,74 @@ define amdgpu_kernel void @fma_f16( ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s2 -; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: s_mov_b32 s16, s4 ; SI-NEXT: s_mov_b32 s17, s5 -; SI-NEXT: s_mov_b32 s18, s10 -; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 -; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; SI-NEXT: buffer_load_ushort v2, off, s[12:15], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_movk_i32 s2, 0x7e00 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_fma_f32 v0, v0, v1, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-NEXT: v_readfirstlane_b32 s0, v1 +; SI-NEXT: s_and_b32 s1, s0, 0x1ff +; SI-NEXT: v_or_b32_e32 v0, s1, v0 +; SI-NEXT: s_lshr_b32 s3, s0, 8 +; SI-NEXT: s_bfe_u32 s4, s0, 0xb0014 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b32 s1, s3, 0xffe +; SI-NEXT: s_sub_i32 s3, 0x3f1, s4 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_med3_i32 v1, s3, 0, 13 +; SI-NEXT: v_readfirstlane_b32 s3, v0 +; SI-NEXT: s_or_b32 s1, s1, s3 +; SI-NEXT: v_readfirstlane_b32 s5, v1 +; SI-NEXT: s_or_b32 s3, s1, 0x1000 +; SI-NEXT: s_lshr_b32 s6, s3, s5 +; SI-NEXT: s_lshl_b32 s5, s6, s5 +; SI-NEXT: s_cmp_lg_u32 s5, s3 +; SI-NEXT: s_cselect_b32 s3, 1, 0 +; SI-NEXT: s_addk_i32 s4, 0xfc10 +; SI-NEXT: s_lshl_b32 s5, s4, 12 +; SI-NEXT: s_or_b32 s3, s6, s3 +; SI-NEXT: s_or_b32 s5, s1, s5 +; SI-NEXT: s_cmp_lt_i32 s4, 1 +; SI-NEXT: s_cselect_b32 s3, s3, s5 +; SI-NEXT: s_and_b32 s5, s3, 7 +; SI-NEXT: s_cmp_gt_i32 s5, 5 +; SI-NEXT: s_cselect_b32 s6, 1, 0 +; SI-NEXT: s_cmp_eq_u32 s5, 3 +; SI-NEXT: s_cselect_b32 s5, 1, 0 +; SI-NEXT: s_lshr_b32 s3, s3, 2 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s3, s3, s5 +; SI-NEXT: s_cmp_lt_i32 s4, 31 +; SI-NEXT: s_cselect_b32 s3, s3, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s1, 0 +; SI-NEXT: s_cselect_b32 s1, s2, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s4, 0x40f +; SI-NEXT: s_cselect_b32 s1, s1, s3 +; SI-NEXT: s_lshr_b32 s0, s0, 16 +; SI-NEXT: s_and_b32 s0, s0, 0x8000 +; SI-NEXT: s_or_b32 s0, s0, s1 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; @@ -112,29 +157,74 @@ define amdgpu_kernel void @fma_f16( define amdgpu_kernel void @fma_f16_imm_a( ; SI-LABEL: fma_f16_imm_a: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s2 -; SI-NEXT: s_mov_b32 s13, s3 -; SI-NEXT: s_mov_b32 s15, s7 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 -; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s2, 0x40400000 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s12, s10 +; SI-NEXT: s_mov_b32 s13, s11 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s4, 0 +; SI-NEXT: s_mov_b32 s5, 0x40080000 +; SI-NEXT: s_mov_b32 s0, s8 +; SI-NEXT: s_mov_b32 s1, s9 +; SI-NEXT: s_movk_i32 s6, 0x7e00 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_fma_f32 v0, v0, s2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; SI-NEXT: v_fma_f64 v[0:1], v[2:3], s[4:5], v[0:1] +; SI-NEXT: v_readfirstlane_b32 s4, v1 +; SI-NEXT: s_and_b32 s5, s4, 0x1ff +; SI-NEXT: v_or_b32_e32 v0, s5, v0 +; SI-NEXT: s_lshr_b32 s7, s4, 8 +; SI-NEXT: s_bfe_u32 s8, s4, 0xb0014 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b32 s5, s7, 0xffe +; SI-NEXT: s_sub_i32 s7, 0x3f1, s8 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_med3_i32 v1, s7, 0, 13 +; SI-NEXT: v_readfirstlane_b32 s7, v0 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: v_readfirstlane_b32 s9, v1 +; SI-NEXT: s_or_b32 s7, s5, 0x1000 +; SI-NEXT: s_lshr_b32 s10, s7, s9 +; SI-NEXT: s_lshl_b32 s9, s10, s9 +; SI-NEXT: s_cmp_lg_u32 s9, s7 +; SI-NEXT: s_cselect_b32 s7, 1, 0 +; SI-NEXT: s_addk_i32 s8, 0xfc10 +; SI-NEXT: s_lshl_b32 s9, s8, 12 +; SI-NEXT: s_or_b32 s7, s10, s7 +; SI-NEXT: s_or_b32 s9, s5, s9 +; SI-NEXT: s_cmp_lt_i32 s8, 1 +; SI-NEXT: s_cselect_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s7, 7 +; SI-NEXT: s_cmp_gt_i32 s9, 5 +; SI-NEXT: s_cselect_b32 s10, 1, 0 +; SI-NEXT: s_cmp_eq_u32 s9, 3 +; SI-NEXT: s_cselect_b32 s9, 1, 0 +; SI-NEXT: s_lshr_b32 s7, s7, 2 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_add_i32 s7, s7, s9 +; SI-NEXT: s_cmp_lt_i32 s8, 31 +; SI-NEXT: s_cselect_b32 s7, s7, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s5, 0 +; SI-NEXT: s_cselect_b32 s5, s6, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s8, 0x40f +; SI-NEXT: s_cselect_b32 s5, s5, s7 +; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: s_and_b32 s4, s4, 0x8000 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fma_f16_imm_a: @@ -195,29 +285,74 @@ define amdgpu_kernel void @fma_f16_imm_a( define amdgpu_kernel void @fma_f16_imm_b( ; SI-LABEL: fma_f16_imm_b: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s2 -; SI-NEXT: s_mov_b32 s13, s3 -; SI-NEXT: s_mov_b32 s15, s7 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 -; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s2, 0x40400000 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s12, s10 +; SI-NEXT: s_mov_b32 s13, s11 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s4, 0 +; SI-NEXT: s_mov_b32 s5, 0x40080000 +; SI-NEXT: s_mov_b32 s0, s8 +; SI-NEXT: s_mov_b32 s1, s9 +; SI-NEXT: s_movk_i32 s6, 0x7e00 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_fma_f32 v0, v0, s2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; SI-NEXT: v_fma_f64 v[0:1], v[2:3], s[4:5], v[0:1] +; SI-NEXT: v_readfirstlane_b32 s4, v1 +; SI-NEXT: s_and_b32 s5, s4, 0x1ff +; SI-NEXT: v_or_b32_e32 v0, s5, v0 +; SI-NEXT: s_lshr_b32 s7, s4, 8 +; SI-NEXT: s_bfe_u32 s8, s4, 0xb0014 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b32 s5, s7, 0xffe +; SI-NEXT: s_sub_i32 s7, 0x3f1, s8 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_med3_i32 v1, s7, 0, 13 +; SI-NEXT: v_readfirstlane_b32 s7, v0 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: v_readfirstlane_b32 s9, v1 +; SI-NEXT: s_or_b32 s7, s5, 0x1000 +; SI-NEXT: s_lshr_b32 s10, s7, s9 +; SI-NEXT: s_lshl_b32 s9, s10, s9 +; SI-NEXT: s_cmp_lg_u32 s9, s7 +; SI-NEXT: s_cselect_b32 s7, 1, 0 +; SI-NEXT: s_addk_i32 s8, 0xfc10 +; SI-NEXT: s_lshl_b32 s9, s8, 12 +; SI-NEXT: s_or_b32 s7, s10, s7 +; SI-NEXT: s_or_b32 s9, s5, s9 +; SI-NEXT: s_cmp_lt_i32 s8, 1 +; SI-NEXT: s_cselect_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s7, 7 +; SI-NEXT: s_cmp_gt_i32 s9, 5 +; SI-NEXT: s_cselect_b32 s10, 1, 0 +; SI-NEXT: s_cmp_eq_u32 s9, 3 +; SI-NEXT: s_cselect_b32 s9, 1, 0 +; SI-NEXT: s_lshr_b32 s7, s7, 2 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_add_i32 s7, s7, s9 +; SI-NEXT: s_cmp_lt_i32 s8, 31 +; SI-NEXT: s_cselect_b32 s7, s7, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s5, 0 +; SI-NEXT: s_cselect_b32 s5, s6, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s8, 0x40f +; SI-NEXT: s_cselect_b32 s5, s5, s7 +; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: s_and_b32 s4, s4, 0x8000 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fma_f16_imm_b: @@ -278,29 +413,74 @@ define amdgpu_kernel void @fma_f16_imm_b( define amdgpu_kernel void @fma_f16_imm_c( ; SI-LABEL: fma_f16_imm_c: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s2 -; SI-NEXT: s_mov_b32 s13, s3 -; SI-NEXT: s_mov_b32 s15, s7 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 -; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s2, 0x40400000 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s12, s10 +; SI-NEXT: s_mov_b32 s13, s11 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s4, 0 +; SI-NEXT: s_mov_b32 s5, 0x40080000 +; SI-NEXT: s_mov_b32 s0, s8 +; SI-NEXT: s_mov_b32 s1, s9 +; SI-NEXT: s_movk_i32 s6, 0x7e00 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_fma_f32 v0, v0, v1, s2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; SI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], s[4:5] +; SI-NEXT: v_readfirstlane_b32 s4, v1 +; SI-NEXT: s_and_b32 s5, s4, 0x1ff +; SI-NEXT: v_or_b32_e32 v0, s5, v0 +; SI-NEXT: s_lshr_b32 s7, s4, 8 +; SI-NEXT: s_bfe_u32 s8, s4, 0xb0014 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b32 s5, s7, 0xffe +; SI-NEXT: s_sub_i32 s7, 0x3f1, s8 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_med3_i32 v1, s7, 0, 13 +; SI-NEXT: v_readfirstlane_b32 s7, v0 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: v_readfirstlane_b32 s9, v1 +; SI-NEXT: s_or_b32 s7, s5, 0x1000 +; SI-NEXT: s_lshr_b32 s10, s7, s9 +; SI-NEXT: s_lshl_b32 s9, s10, s9 +; SI-NEXT: s_cmp_lg_u32 s9, s7 +; SI-NEXT: s_cselect_b32 s7, 1, 0 +; SI-NEXT: s_addk_i32 s8, 0xfc10 +; SI-NEXT: s_lshl_b32 s9, s8, 12 +; SI-NEXT: s_or_b32 s7, s10, s7 +; SI-NEXT: s_or_b32 s9, s5, s9 +; SI-NEXT: s_cmp_lt_i32 s8, 1 +; SI-NEXT: s_cselect_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s7, 7 +; SI-NEXT: s_cmp_gt_i32 s9, 5 +; SI-NEXT: s_cselect_b32 s10, 1, 0 +; SI-NEXT: s_cmp_eq_u32 s9, 3 +; SI-NEXT: s_cselect_b32 s9, 1, 0 +; SI-NEXT: s_lshr_b32 s7, s7, 2 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_add_i32 s7, s7, s9 +; SI-NEXT: s_cmp_lt_i32 s8, 31 +; SI-NEXT: s_cselect_b32 s7, s7, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s5, 0 +; SI-NEXT: s_cselect_b32 s5, s6, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s8, 0x40f +; SI-NEXT: s_cselect_b32 s5, s5, s7 +; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: s_and_b32 s4, s4, 0x8000 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fma_f16_imm_c: @@ -376,30 +556,119 @@ define amdgpu_kernel void @fma_v2f16( ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_movk_i32 s2, 0x7e00 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_fma_f32 v0, v0, v4, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_fma_f32 v1, v3, v1, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v6 +; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 +; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v8 +; SI-NEXT: v_cvt_f64_f32_e32 v[8:9], v9 +; SI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 +; SI-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-NEXT: v_readfirstlane_b32 s0, v1 +; SI-NEXT: s_and_b32 s1, s0, 0x1ff +; SI-NEXT: v_or_b32_e32 v0, s1, v0 +; SI-NEXT: s_lshr_b32 s3, s0, 8 +; SI-NEXT: s_bfe_u32 s4, s0, 0xb0014 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b32 s1, s3, 0xffe +; SI-NEXT: s_sub_i32 s3, 0x3f1, s4 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_med3_i32 v1, s3, 0, 13 +; SI-NEXT: v_readfirstlane_b32 s3, v0 +; SI-NEXT: s_or_b32 s1, s1, s3 +; SI-NEXT: v_readfirstlane_b32 s6, v1 +; SI-NEXT: s_or_b32 s3, s1, 0x1000 +; SI-NEXT: s_lshr_b32 s7, s3, s6 +; SI-NEXT: s_lshl_b32 s6, s7, s6 +; SI-NEXT: s_cmp_lg_u32 s6, s3 +; SI-NEXT: s_cselect_b32 s3, 1, 0 +; SI-NEXT: s_addk_i32 s4, 0xfc10 +; SI-NEXT: s_lshl_b32 s6, s4, 12 +; SI-NEXT: s_or_b32 s3, s7, s3 +; SI-NEXT: s_or_b32 s6, s1, s6 +; SI-NEXT: s_cmp_lt_i32 s4, 1 +; SI-NEXT: s_cselect_b32 s3, s3, s6 +; SI-NEXT: s_and_b32 s6, s3, 7 +; SI-NEXT: s_cmp_gt_i32 s6, 5 +; SI-NEXT: s_cselect_b32 s7, 1, 0 +; SI-NEXT: s_cmp_eq_u32 s6, 3 +; SI-NEXT: s_cselect_b32 s6, 1, 0 +; SI-NEXT: s_lshr_b32 s3, s3, 2 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s3, s3, s6 +; SI-NEXT: v_fma_f64 v[2:3], v[10:11], v[8:9], v[6:7] +; SI-NEXT: s_cmp_lt_i32 s4, 31 +; SI-NEXT: s_cselect_b32 s3, s3, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s1, 0 +; SI-NEXT: v_readfirstlane_b32 s5, v3 +; SI-NEXT: s_cselect_b32 s1, s2, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s4, 0x40f +; SI-NEXT: s_cselect_b32 s1, s1, s3 +; SI-NEXT: s_and_b32 s3, s5, 0x1ff +; SI-NEXT: v_or_b32_e32 v0, s3, v2 +; SI-NEXT: s_lshr_b32 s0, s0, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_lshr_b32 s4, s5, 8 +; SI-NEXT: s_bfe_u32 s6, s5, 0xb0014 +; SI-NEXT: s_and_b32 s0, s0, 0x8000 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: s_and_b32 s3, s4, 0xffe +; SI-NEXT: s_sub_i32 s4, 0x3f1, s6 +; SI-NEXT: s_or_b32 s0, s0, s1 +; SI-NEXT: v_readfirstlane_b32 s1, v0 +; SI-NEXT: v_med3_i32 v1, s4, 0, 13 +; SI-NEXT: s_or_b32 s1, s3, s1 +; SI-NEXT: v_readfirstlane_b32 s4, v1 +; SI-NEXT: s_or_b32 s3, s1, 0x1000 +; SI-NEXT: s_lshr_b32 s7, s3, s4 +; SI-NEXT: s_and_b32 s0, s0, 0xffff +; SI-NEXT: s_lshl_b32 s4, s7, s4 +; SI-NEXT: s_cmp_lg_u32 s4, s3 +; SI-NEXT: s_cselect_b32 s3, 1, 0 +; SI-NEXT: s_addk_i32 s6, 0xfc10 +; SI-NEXT: s_lshl_b32 s4, s6, 12 +; SI-NEXT: s_or_b32 s3, s7, s3 +; SI-NEXT: s_or_b32 s4, s1, s4 +; SI-NEXT: s_cmp_lt_i32 s6, 1 +; SI-NEXT: s_cselect_b32 s3, s3, s4 +; SI-NEXT: s_and_b32 s4, s3, 7 +; SI-NEXT: s_cmp_gt_i32 s4, 5 +; SI-NEXT: s_cselect_b32 s7, 1, 0 +; SI-NEXT: s_cmp_eq_u32 s4, 3 +; SI-NEXT: s_cselect_b32 s4, 1, 0 +; SI-NEXT: s_lshr_b32 s3, s3, 2 +; SI-NEXT: s_or_b32 s4, s4, s7 +; SI-NEXT: s_add_i32 s3, s3, s4 +; SI-NEXT: s_cmp_lt_i32 s6, 31 +; SI-NEXT: s_cselect_b32 s3, s3, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s1, 0 +; SI-NEXT: s_cselect_b32 s1, s2, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s6, 0x40f +; SI-NEXT: s_cselect_b32 s1, s1, s3 +; SI-NEXT: s_lshr_b32 s2, s5, 16 +; SI-NEXT: s_and_b32 s2, s2, 0x8000 +; SI-NEXT: s_or_b32 s1, s2, s1 +; SI-NEXT: s_lshl_b32 s1, s1, 16 +; SI-NEXT: s_or_b32 s0, s0, s1 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; @@ -481,37 +750,125 @@ define amdgpu_kernel void @fma_v2f16( define amdgpu_kernel void @fma_v2f16_imm_a( ; SI-LABEL: fma_v2f16_imm_a: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s2 -; SI-NEXT: s_mov_b32 s13, s3 -; SI-NEXT: s_mov_b32 s14, s6 -; SI-NEXT: s_mov_b32 s15, s7 -; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s12, s10 +; SI-NEXT: s_mov_b32 s13, s11 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s2, 0x40400000 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s4, 0 +; SI-NEXT: s_mov_b32 s5, 0x40080000 +; SI-NEXT: s_mov_b32 s0, s8 +; SI-NEXT: s_movk_i32 s6, 0x7e00 +; SI-NEXT: s_mov_b32 s1, s9 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_fma_f32 v2, v3, s2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_fma_f32 v0, v1, s2, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v4 +; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 +; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; SI-NEXT: v_fma_f64 v[0:1], v[2:3], s[4:5], v[0:1] +; SI-NEXT: v_fma_f64 v[2:3], v[6:7], s[4:5], v[4:5] +; SI-NEXT: v_readfirstlane_b32 s4, v1 +; SI-NEXT: s_and_b32 s5, s4, 0x1ff +; SI-NEXT: v_or_b32_e32 v0, s5, v0 +; SI-NEXT: s_lshr_b32 s7, s4, 8 +; SI-NEXT: s_bfe_u32 s8, s4, 0xb0014 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b32 s5, s7, 0xffe +; SI-NEXT: s_sub_i32 s7, 0x3f1, s8 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_med3_i32 v1, s7, 0, 13 +; SI-NEXT: v_readfirstlane_b32 s7, v0 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: v_readfirstlane_b32 s10, v1 +; SI-NEXT: s_or_b32 s7, s5, 0x1000 +; SI-NEXT: s_lshr_b32 s11, s7, s10 +; SI-NEXT: s_lshl_b32 s10, s11, s10 +; SI-NEXT: s_cmp_lg_u32 s10, s7 +; SI-NEXT: s_cselect_b32 s7, 1, 0 +; SI-NEXT: s_addk_i32 s8, 0xfc10 +; SI-NEXT: s_lshl_b32 s10, s8, 12 +; SI-NEXT: s_or_b32 s7, s11, s7 +; SI-NEXT: s_or_b32 s10, s5, s10 +; SI-NEXT: s_cmp_lt_i32 s8, 1 +; SI-NEXT: s_cselect_b32 s7, s7, s10 +; SI-NEXT: s_and_b32 s10, s7, 7 +; SI-NEXT: s_cmp_gt_i32 s10, 5 +; SI-NEXT: s_cselect_b32 s11, 1, 0 +; SI-NEXT: s_cmp_eq_u32 s10, 3 +; SI-NEXT: s_cselect_b32 s10, 1, 0 +; SI-NEXT: s_lshr_b32 s7, s7, 2 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_add_i32 s7, s7, s10 +; SI-NEXT: s_cmp_lt_i32 s8, 31 +; SI-NEXT: s_cselect_b32 s7, s7, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s5, 0 +; SI-NEXT: v_readfirstlane_b32 s9, v3 +; SI-NEXT: s_cselect_b32 s5, s6, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s8, 0x40f +; SI-NEXT: s_cselect_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s9, 0x1ff +; SI-NEXT: v_or_b32_e32 v0, s7, v2 +; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_lshr_b32 s8, s9, 8 +; SI-NEXT: s_bfe_u32 s10, s9, 0xb0014 +; SI-NEXT: s_and_b32 s4, s4, 0x8000 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: s_and_b32 s7, s8, 0xffe +; SI-NEXT: s_sub_i32 s8, 0x3f1, s10 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readfirstlane_b32 s5, v0 +; SI-NEXT: v_med3_i32 v1, s8, 0, 13 +; SI-NEXT: s_or_b32 s5, s7, s5 +; SI-NEXT: v_readfirstlane_b32 s8, v1 +; SI-NEXT: s_or_b32 s7, s5, 0x1000 +; SI-NEXT: s_lshr_b32 s11, s7, s8 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s8, s11, s8 +; SI-NEXT: s_cmp_lg_u32 s8, s7 +; SI-NEXT: s_cselect_b32 s7, 1, 0 +; SI-NEXT: s_addk_i32 s10, 0xfc10 +; SI-NEXT: s_lshl_b32 s8, s10, 12 +; SI-NEXT: s_or_b32 s7, s11, s7 +; SI-NEXT: s_or_b32 s8, s5, s8 +; SI-NEXT: s_cmp_lt_i32 s10, 1 +; SI-NEXT: s_cselect_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s7, 7 +; SI-NEXT: s_cmp_gt_i32 s8, 5 +; SI-NEXT: s_cselect_b32 s11, 1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 3 +; SI-NEXT: s_cselect_b32 s8, 1, 0 +; SI-NEXT: s_lshr_b32 s7, s7, 2 +; SI-NEXT: s_or_b32 s8, s8, s11 +; SI-NEXT: s_add_i32 s7, s7, s8 +; SI-NEXT: s_cmp_lt_i32 s10, 31 +; SI-NEXT: s_cselect_b32 s7, s7, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s5, 0 +; SI-NEXT: s_cselect_b32 s5, s6, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s10, 0x40f +; SI-NEXT: s_cselect_b32 s5, s5, s7 +; SI-NEXT: s_lshr_b32 s6, s9, 16 +; SI-NEXT: s_and_b32 s6, s6, 0x8000 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fma_v2f16_imm_a: @@ -578,37 +935,125 @@ define amdgpu_kernel void @fma_v2f16_imm_a( define amdgpu_kernel void @fma_v2f16_imm_b( ; SI-LABEL: fma_v2f16_imm_b: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s2 -; SI-NEXT: s_mov_b32 s13, s3 -; SI-NEXT: s_mov_b32 s14, s6 -; SI-NEXT: s_mov_b32 s15, s7 -; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s12, s10 +; SI-NEXT: s_mov_b32 s13, s11 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s2, 0x40400000 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s4, 0 +; SI-NEXT: s_mov_b32 s5, 0x40080000 +; SI-NEXT: s_mov_b32 s0, s8 +; SI-NEXT: s_movk_i32 s6, 0x7e00 +; SI-NEXT: s_mov_b32 s1, s9 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_fma_f32 v2, v3, s2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_fma_f32 v0, v1, s2, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v4 +; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 +; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; SI-NEXT: v_fma_f64 v[0:1], v[2:3], s[4:5], v[0:1] +; SI-NEXT: v_fma_f64 v[2:3], v[6:7], s[4:5], v[4:5] +; SI-NEXT: v_readfirstlane_b32 s4, v1 +; SI-NEXT: s_and_b32 s5, s4, 0x1ff +; SI-NEXT: v_or_b32_e32 v0, s5, v0 +; SI-NEXT: s_lshr_b32 s7, s4, 8 +; SI-NEXT: s_bfe_u32 s8, s4, 0xb0014 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b32 s5, s7, 0xffe +; SI-NEXT: s_sub_i32 s7, 0x3f1, s8 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_med3_i32 v1, s7, 0, 13 +; SI-NEXT: v_readfirstlane_b32 s7, v0 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: v_readfirstlane_b32 s10, v1 +; SI-NEXT: s_or_b32 s7, s5, 0x1000 +; SI-NEXT: s_lshr_b32 s11, s7, s10 +; SI-NEXT: s_lshl_b32 s10, s11, s10 +; SI-NEXT: s_cmp_lg_u32 s10, s7 +; SI-NEXT: s_cselect_b32 s7, 1, 0 +; SI-NEXT: s_addk_i32 s8, 0xfc10 +; SI-NEXT: s_lshl_b32 s10, s8, 12 +; SI-NEXT: s_or_b32 s7, s11, s7 +; SI-NEXT: s_or_b32 s10, s5, s10 +; SI-NEXT: s_cmp_lt_i32 s8, 1 +; SI-NEXT: s_cselect_b32 s7, s7, s10 +; SI-NEXT: s_and_b32 s10, s7, 7 +; SI-NEXT: s_cmp_gt_i32 s10, 5 +; SI-NEXT: s_cselect_b32 s11, 1, 0 +; SI-NEXT: s_cmp_eq_u32 s10, 3 +; SI-NEXT: s_cselect_b32 s10, 1, 0 +; SI-NEXT: s_lshr_b32 s7, s7, 2 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_add_i32 s7, s7, s10 +; SI-NEXT: s_cmp_lt_i32 s8, 31 +; SI-NEXT: s_cselect_b32 s7, s7, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s5, 0 +; SI-NEXT: v_readfirstlane_b32 s9, v3 +; SI-NEXT: s_cselect_b32 s5, s6, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s8, 0x40f +; SI-NEXT: s_cselect_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s9, 0x1ff +; SI-NEXT: v_or_b32_e32 v0, s7, v2 +; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_lshr_b32 s8, s9, 8 +; SI-NEXT: s_bfe_u32 s10, s9, 0xb0014 +; SI-NEXT: s_and_b32 s4, s4, 0x8000 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: s_and_b32 s7, s8, 0xffe +; SI-NEXT: s_sub_i32 s8, 0x3f1, s10 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readfirstlane_b32 s5, v0 +; SI-NEXT: v_med3_i32 v1, s8, 0, 13 +; SI-NEXT: s_or_b32 s5, s7, s5 +; SI-NEXT: v_readfirstlane_b32 s8, v1 +; SI-NEXT: s_or_b32 s7, s5, 0x1000 +; SI-NEXT: s_lshr_b32 s11, s7, s8 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s8, s11, s8 +; SI-NEXT: s_cmp_lg_u32 s8, s7 +; SI-NEXT: s_cselect_b32 s7, 1, 0 +; SI-NEXT: s_addk_i32 s10, 0xfc10 +; SI-NEXT: s_lshl_b32 s8, s10, 12 +; SI-NEXT: s_or_b32 s7, s11, s7 +; SI-NEXT: s_or_b32 s8, s5, s8 +; SI-NEXT: s_cmp_lt_i32 s10, 1 +; SI-NEXT: s_cselect_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s7, 7 +; SI-NEXT: s_cmp_gt_i32 s8, 5 +; SI-NEXT: s_cselect_b32 s11, 1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 3 +; SI-NEXT: s_cselect_b32 s8, 1, 0 +; SI-NEXT: s_lshr_b32 s7, s7, 2 +; SI-NEXT: s_or_b32 s8, s8, s11 +; SI-NEXT: s_add_i32 s7, s7, s8 +; SI-NEXT: s_cmp_lt_i32 s10, 31 +; SI-NEXT: s_cselect_b32 s7, s7, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s5, 0 +; SI-NEXT: s_cselect_b32 s5, s6, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s10, 0x40f +; SI-NEXT: s_cselect_b32 s5, s5, s7 +; SI-NEXT: s_lshr_b32 s6, s9, 16 +; SI-NEXT: s_and_b32 s6, s6, 0x8000 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fma_v2f16_imm_b: @@ -675,37 +1120,125 @@ define amdgpu_kernel void @fma_v2f16_imm_b( define amdgpu_kernel void @fma_v2f16_imm_c( ; SI-LABEL: fma_v2f16_imm_c: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s2 -; SI-NEXT: s_mov_b32 s13, s3 -; SI-NEXT: s_mov_b32 s14, s6 -; SI-NEXT: s_mov_b32 s15, s7 -; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s12, s10 +; SI-NEXT: s_mov_b32 s13, s11 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s2, 0x40400000 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s4, 0 +; SI-NEXT: s_mov_b32 s5, 0x40080000 +; SI-NEXT: s_mov_b32 s0, s8 +; SI-NEXT: s_movk_i32 s6, 0x7e00 +; SI-NEXT: s_mov_b32 s1, s9 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_fma_f32 v2, v3, v2, s2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_fma_f32 v0, v1, v0, s2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v4 +; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 +; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; SI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], s[4:5] +; SI-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], s[4:5] +; SI-NEXT: v_readfirstlane_b32 s4, v1 +; SI-NEXT: s_and_b32 s5, s4, 0x1ff +; SI-NEXT: v_or_b32_e32 v0, s5, v0 +; SI-NEXT: s_lshr_b32 s7, s4, 8 +; SI-NEXT: s_bfe_u32 s8, s4, 0xb0014 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b32 s5, s7, 0xffe +; SI-NEXT: s_sub_i32 s7, 0x3f1, s8 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_med3_i32 v1, s7, 0, 13 +; SI-NEXT: v_readfirstlane_b32 s7, v0 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: v_readfirstlane_b32 s10, v1 +; SI-NEXT: s_or_b32 s7, s5, 0x1000 +; SI-NEXT: s_lshr_b32 s11, s7, s10 +; SI-NEXT: s_lshl_b32 s10, s11, s10 +; SI-NEXT: s_cmp_lg_u32 s10, s7 +; SI-NEXT: s_cselect_b32 s7, 1, 0 +; SI-NEXT: s_addk_i32 s8, 0xfc10 +; SI-NEXT: s_lshl_b32 s10, s8, 12 +; SI-NEXT: s_or_b32 s7, s11, s7 +; SI-NEXT: s_or_b32 s10, s5, s10 +; SI-NEXT: s_cmp_lt_i32 s8, 1 +; SI-NEXT: s_cselect_b32 s7, s7, s10 +; SI-NEXT: s_and_b32 s10, s7, 7 +; SI-NEXT: s_cmp_gt_i32 s10, 5 +; SI-NEXT: s_cselect_b32 s11, 1, 0 +; SI-NEXT: s_cmp_eq_u32 s10, 3 +; SI-NEXT: s_cselect_b32 s10, 1, 0 +; SI-NEXT: s_lshr_b32 s7, s7, 2 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_add_i32 s7, s7, s10 +; SI-NEXT: s_cmp_lt_i32 s8, 31 +; SI-NEXT: s_cselect_b32 s7, s7, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s5, 0 +; SI-NEXT: v_readfirstlane_b32 s9, v3 +; SI-NEXT: s_cselect_b32 s5, s6, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s8, 0x40f +; SI-NEXT: s_cselect_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s9, 0x1ff +; SI-NEXT: v_or_b32_e32 v0, s7, v2 +; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_lshr_b32 s8, s9, 8 +; SI-NEXT: s_bfe_u32 s10, s9, 0xb0014 +; SI-NEXT: s_and_b32 s4, s4, 0x8000 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: s_and_b32 s7, s8, 0xffe +; SI-NEXT: s_sub_i32 s8, 0x3f1, s10 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readfirstlane_b32 s5, v0 +; SI-NEXT: v_med3_i32 v1, s8, 0, 13 +; SI-NEXT: s_or_b32 s5, s7, s5 +; SI-NEXT: v_readfirstlane_b32 s8, v1 +; SI-NEXT: s_or_b32 s7, s5, 0x1000 +; SI-NEXT: s_lshr_b32 s11, s7, s8 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s8, s11, s8 +; SI-NEXT: s_cmp_lg_u32 s8, s7 +; SI-NEXT: s_cselect_b32 s7, 1, 0 +; SI-NEXT: s_addk_i32 s10, 0xfc10 +; SI-NEXT: s_lshl_b32 s8, s10, 12 +; SI-NEXT: s_or_b32 s7, s11, s7 +; SI-NEXT: s_or_b32 s8, s5, s8 +; SI-NEXT: s_cmp_lt_i32 s10, 1 +; SI-NEXT: s_cselect_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s7, 7 +; SI-NEXT: s_cmp_gt_i32 s8, 5 +; SI-NEXT: s_cselect_b32 s11, 1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 3 +; SI-NEXT: s_cselect_b32 s8, 1, 0 +; SI-NEXT: s_lshr_b32 s7, s7, 2 +; SI-NEXT: s_or_b32 s8, s8, s11 +; SI-NEXT: s_add_i32 s7, s7, s8 +; SI-NEXT: s_cmp_lt_i32 s10, 31 +; SI-NEXT: s_cselect_b32 s7, s7, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s5, 0 +; SI-NEXT: s_cselect_b32 s5, s6, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s10, 0x40f +; SI-NEXT: s_cselect_b32 s5, s5, s7 +; SI-NEXT: s_lshr_b32 s6, s9, 16 +; SI-NEXT: s_and_b32 s6, s6, 0x8000 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fma_v2f16_imm_c: @@ -787,45 +1320,222 @@ define amdgpu_kernel void @fma_v4f16( ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: buffer_load_dwordx2 v[5:6], off, s[12:15], 0 ; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 -; SI-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_movk_i32 s2, 0x7e00 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v6 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v5 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v7 +; SI-NEXT: v_cvt_f64_f32_e32 v[5:6], v3 +; SI-NEXT: v_cvt_f64_f32_e32 v[7:8], v1 +; SI-NEXT: v_cvt_f64_f32_e32 v[9:10], v10 +; SI-NEXT: v_cvt_f64_f32_e32 v[11:12], v11 +; SI-NEXT: v_cvt_f64_f32_e32 v[13:14], v13 +; SI-NEXT: v_cvt_f64_f32_e32 v[15:16], v15 +; SI-NEXT: v_fma_f64 v[5:6], v[9:10], v[7:8], v[5:6] +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_readfirstlane_b32 s0, v6 +; SI-NEXT: v_fma_f64 v[2:3], v[15:16], v[13:14], v[11:12] +; SI-NEXT: s_and_b32 s3, s0, 0x1ff +; SI-NEXT: v_readfirstlane_b32 s1, v3 +; SI-NEXT: v_or_b32_e32 v3, s3, v5 +; SI-NEXT: s_lshr_b32 s4, s0, 8 +; SI-NEXT: s_bfe_u32 s5, s0, 0xb0014 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: s_and_b32 s3, s4, 0xffe +; SI-NEXT: s_sub_i32 s4, 0x3f1, s5 +; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; SI-NEXT: v_med3_i32 v5, s4, 0, 13 +; SI-NEXT: v_readfirstlane_b32 s4, v3 +; SI-NEXT: s_or_b32 s3, s3, s4 +; SI-NEXT: v_readfirstlane_b32 s7, v5 +; SI-NEXT: s_or_b32 s4, s3, 0x1000 +; SI-NEXT: s_lshr_b32 s12, s4, s7 +; SI-NEXT: s_lshl_b32 s7, s12, s7 +; SI-NEXT: s_cmp_lg_u32 s7, s4 +; SI-NEXT: s_cselect_b32 s4, 1, 0 +; SI-NEXT: s_addk_i32 s5, 0xfc10 +; SI-NEXT: s_lshl_b32 s7, s5, 12 +; SI-NEXT: s_or_b32 s4, s12, s4 +; SI-NEXT: s_or_b32 s7, s3, s7 +; SI-NEXT: s_cmp_lt_i32 s5, 1 +; SI-NEXT: s_cselect_b32 s4, s4, s7 +; SI-NEXT: s_and_b32 s7, s4, 7 +; SI-NEXT: s_cmp_gt_i32 s7, 5 +; SI-NEXT: v_cvt_f64_f32_e32 v[7:8], v17 +; SI-NEXT: v_cvt_f64_f32_e32 v[9:10], v18 +; SI-NEXT: v_cvt_f64_f32_e32 v[17:18], v19 +; SI-NEXT: s_cselect_b32 s12, 1, 0 +; SI-NEXT: s_cmp_eq_u32 s7, 3 +; SI-NEXT: s_cselect_b32 s7, 1, 0 +; SI-NEXT: s_lshr_b32 s4, s4, 2 +; SI-NEXT: s_or_b32 s7, s7, s12 +; SI-NEXT: s_add_i32 s4, s4, s7 +; SI-NEXT: v_fma_f64 v[7:8], v[17:18], v[9:10], v[7:8] +; SI-NEXT: s_cmp_lt_i32 s5, 31 +; SI-NEXT: s_cselect_b32 s4, s4, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s3, 0 +; SI-NEXT: v_readfirstlane_b32 s6, v8 +; SI-NEXT: s_cselect_b32 s3, s2, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s5, 0x40f +; SI-NEXT: s_cselect_b32 s3, s3, s4 +; SI-NEXT: s_and_b32 s4, s6, 0x1ff +; SI-NEXT: v_or_b32_e32 v3, s4, v7 +; SI-NEXT: s_lshr_b32 s0, s0, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: s_lshr_b32 s5, s6, 8 +; SI-NEXT: s_bfe_u32 s7, s6, 0xb0014 +; SI-NEXT: s_and_b32 s0, s0, 0x8000 +; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; SI-NEXT: s_and_b32 s4, s5, 0xffe +; SI-NEXT: s_sub_i32 s5, 0x3f1, s7 +; SI-NEXT: s_or_b32 s0, s0, s3 +; SI-NEXT: v_readfirstlane_b32 s3, v3 +; SI-NEXT: v_med3_i32 v5, s5, 0, 13 +; SI-NEXT: s_or_b32 s3, s4, s3 +; SI-NEXT: v_readfirstlane_b32 s5, v5 +; SI-NEXT: s_or_b32 s4, s3, 0x1000 +; SI-NEXT: s_lshr_b32 s12, s4, s5 +; SI-NEXT: s_and_b32 s0, s0, 0xffff +; SI-NEXT: s_lshl_b32 s5, s12, s5 +; SI-NEXT: s_cmp_lg_u32 s5, s4 +; SI-NEXT: s_cselect_b32 s4, 1, 0 +; SI-NEXT: s_addk_i32 s7, 0xfc10 +; SI-NEXT: s_lshl_b32 s5, s7, 12 +; SI-NEXT: s_or_b32 s4, s12, s4 +; SI-NEXT: s_or_b32 s5, s3, s5 +; SI-NEXT: s_cmp_lt_i32 s7, 1 +; SI-NEXT: s_cselect_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s4, 7 +; SI-NEXT: s_cmp_gt_i32 s5, 5 +; SI-NEXT: s_cselect_b32 s12, 1, 0 +; SI-NEXT: s_cmp_eq_u32 s5, 3 +; SI-NEXT: s_cselect_b32 s5, 1, 0 +; SI-NEXT: s_lshr_b32 s4, s4, 2 +; SI-NEXT: s_or_b32 s5, s5, s12 +; SI-NEXT: s_add_i32 s4, s4, s5 +; SI-NEXT: s_cmp_lt_i32 s7, 31 +; SI-NEXT: s_cselect_b32 s4, s4, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s3, 0 +; SI-NEXT: s_cselect_b32 s3, s2, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s7, 0x40f +; SI-NEXT: s_cselect_b32 s3, s3, s4 +; SI-NEXT: s_and_b32 s5, s1, 0x1ff +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_or_b32_e32 v2, s5, v2 +; SI-NEXT: s_and_b32 s4, s4, 0x8000 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_lshr_b32 s6, s1, 8 +; SI-NEXT: s_bfe_u32 s7, s1, 0xb0014 +; SI-NEXT: s_or_b32 s3, s4, s3 +; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-NEXT: s_and_b32 s5, s6, 0xffe +; SI-NEXT: s_sub_i32 s6, 0x3f1, s7 +; SI-NEXT: s_lshl_b32 s3, s3, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v2 +; SI-NEXT: v_med3_i32 v3, s6, 0, 13 +; SI-NEXT: s_or_b32 s0, s0, s3 +; SI-NEXT: s_or_b32 s3, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s6, v3 +; SI-NEXT: s_or_b32 s4, s3, 0x1000 +; SI-NEXT: s_lshr_b32 s5, s4, s6 +; SI-NEXT: s_lshl_b32 s6, s5, s6 +; SI-NEXT: s_cmp_lg_u32 s6, s4 +; SI-NEXT: s_cselect_b32 s4, 1, 0 +; SI-NEXT: s_addk_i32 s7, 0xfc10 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_lshl_b32 s5, s7, 12 +; SI-NEXT: s_or_b32 s5, s3, s5 +; SI-NEXT: s_cmp_lt_i32 s7, 1 +; SI-NEXT: s_cselect_b32 s4, s4, s5 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s5, s4, 7 +; SI-NEXT: s_cmp_gt_i32 s5, 5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_fma_f32 v7, v7, v9, v11 -; SI-NEXT: v_fma_f32 v6, v6, v8, v10 -; SI-NEXT: v_fma_f32 v1, v1, v3, v5 -; SI-NEXT: v_fma_f32 v0, v0, v2, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v3, v0 +; SI-NEXT: s_cselect_b32 s6, 1, 0 +; SI-NEXT: s_cmp_eq_u32 s5, 3 +; SI-NEXT: s_cselect_b32 s5, 1, 0 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_lshr_b32 s4, s4, 2 +; SI-NEXT: s_add_i32 s4, s4, s5 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-NEXT: s_cmp_lt_i32 s7, 31 +; SI-NEXT: s_cselect_b32 s4, s4, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s3, 0 +; SI-NEXT: s_cselect_b32 s3, s2, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s7, 0x40f +; SI-NEXT: s_cselect_b32 s3, s3, s4 +; SI-NEXT: s_lshr_b32 s1, s1, 16 +; SI-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-NEXT: s_and_b32 s1, s1, 0x8000 +; SI-NEXT: s_or_b32 s1, s1, s3 +; SI-NEXT: v_readfirstlane_b32 s3, v1 +; SI-NEXT: s_and_b32 s4, s3, 0x1ff +; SI-NEXT: v_or_b32_e32 v0, s4, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: s_lshr_b32 s4, s3, 8 +; SI-NEXT: s_bfe_u32 s6, s3, 0xb0014 +; SI-NEXT: s_and_b32 s4, s4, 0xffe +; SI-NEXT: v_readfirstlane_b32 s5, v0 +; SI-NEXT: s_sub_i32 s7, 0x3f1, s6 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_med3_i32 v0, s7, 0, 13 +; SI-NEXT: s_or_b32 s5, s4, 0x1000 +; SI-NEXT: v_readfirstlane_b32 s7, v0 +; SI-NEXT: s_lshr_b32 s12, s5, s7 +; SI-NEXT: s_and_b32 s1, s1, 0xffff +; SI-NEXT: s_lshl_b32 s7, s12, s7 +; SI-NEXT: s_cmp_lg_u32 s7, s5 +; SI-NEXT: s_cselect_b32 s5, 1, 0 +; SI-NEXT: s_addk_i32 s6, 0xfc10 +; SI-NEXT: s_lshl_b32 s7, s6, 12 +; SI-NEXT: s_or_b32 s5, s12, s5 +; SI-NEXT: s_or_b32 s7, s4, s7 +; SI-NEXT: s_cmp_lt_i32 s6, 1 +; SI-NEXT: s_cselect_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s5, 7 +; SI-NEXT: s_cmp_gt_i32 s7, 5 +; SI-NEXT: s_cselect_b32 s12, 1, 0 +; SI-NEXT: s_cmp_eq_u32 s7, 3 +; SI-NEXT: s_cselect_b32 s7, 1, 0 +; SI-NEXT: s_or_b32 s7, s7, s12 +; SI-NEXT: s_lshr_b32 s5, s5, 2 +; SI-NEXT: s_add_i32 s5, s5, s7 +; SI-NEXT: s_cmp_lt_i32 s6, 31 +; SI-NEXT: s_cselect_b32 s5, s5, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s4, 0 +; SI-NEXT: s_cselect_b32 s2, s2, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s6, 0x40f +; SI-NEXT: s_cselect_b32 s2, s2, s5 +; SI-NEXT: s_lshr_b32 s3, s3, 16 +; SI-NEXT: s_and_b32 s3, s3, 0x8000 +; SI-NEXT: s_or_b32 s2, s3, s2 +; SI-NEXT: s_lshl_b32 s2, s2, 16 +; SI-NEXT: s_or_b32 s1, s1, s2 +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll index 385d76bc42bda..a32a456c102dd 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll @@ -19,32 +19,33 @@ define amdgpu_kernel void @fmuladd_f16( ; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s2 -; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: s_mov_b32 s16, s4 ; SI-NEXT: s_mov_b32 s17, s5 -; SI-NEXT: s_mov_b32 s18, s10 -; SI-NEXT: s_mov_b32 s19, s11 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 -; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: buffer_load_ushort v2, off, s[12:15], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_mul_f32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_mac_f32_e32 v2, v0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; @@ -302,17 +303,20 @@ define amdgpu_kernel void @fmuladd_f16_imm_a( ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: s_mov_b32 s15, s7 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_madmk_f32 v0, v0, 0x40400000, v1 +; SI-NEXT: v_mul_f32_e32 v0, 0x40400000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -534,17 +538,20 @@ define amdgpu_kernel void @fmuladd_f16_imm_b( ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: s_mov_b32 s15, s7 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_madmk_f32 v0, v0, 0x40400000, v1 +; SI-NEXT: v_mul_f32_e32 v0, 0x40400000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -769,33 +776,39 @@ define amdgpu_kernel void @fmuladd_v2f16( ; SI-NEXT: s_mov_b32 s17, s5 ; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_mul_f32_e32 v3, v3, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_mac_f32_e32 v5, v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_mac_f32_e32 v2, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, v0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll index 8c4d4788c4bdf..43cc632d3708b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll @@ -650,16 +650,16 @@ define <3 x half> @test_ldexp_v3f16_v3i32(<3 x half> %a, <3 x i32> %b) { ; GFX6-SDAG: ; %bb.0: ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v3, v5, v3 +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 -; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v2, v5, v3 -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v1, v1, v4 ; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX6-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -816,13 +816,13 @@ define <3 x half> @test_ldexp_v3f16_v3i16(<3 x half> %a, <3 x i16> %b) { ; GFX6-SDAG-NEXT: v_ashrrev_i32_e32 v5, 16, v2 ; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v4, v4, v5 ; GFX6-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 16 ; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v4 ; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v1, v1, v3 ; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX6-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -935,23 +935,23 @@ define <4 x half> @test_ldexp_v4f16_v4i32(<4 x half> %a, <4 x i32> %b) { ; GFX6-SDAG-LABEL: test_ldexp_v4f16_v4i32: ; GFX6-SDAG: ; %bb.0: ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v0 ; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 -; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v2, v6, v3 +; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v5, v6, v5 +; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v7 +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v1, v1, v4 -; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v4, v7, v5 -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v3, v6, v3 +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v4 ; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX6-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1134,26 +1134,26 @@ define <4 x half> @test_ldexp_v4f16_v4i16(<4 x half> %a, <4 x i16> %b) { ; GFX6-SDAG: ; %bb.0: ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-SDAG-NEXT: v_ashrrev_i32_e32 v4, 16, v2 -; GFX6-SDAG-NEXT: v_ashrrev_i32_e32 v6, 16, v3 +; GFX6-SDAG-NEXT: v_ashrrev_i32_e32 v7, 16, v3 ; GFX6-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v4, v5, v4 ; GFX6-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v6, v7, v6 -; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v6, v6, v7 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v4, v5, v4 +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v1, v1, v3 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v6 ; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX6-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX6-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll index 4e8ffdcb00310..7903ae93d770c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll @@ -5901,26 +5901,21 @@ define float @v_log_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) { ; SI-SDAG-LABEL: v_log_f32_from_fpext_math_f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 ; SI-SDAG-NEXT: s_mov_b32 s5, 0x3377d1cf +; SI-SDAG-NEXT: s_mov_b32 s6, 0x7f800000 ; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 -; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc -; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 ; SI-SDAG-NEXT: v_fma_f32 v2, v0, s4, -v1 ; SI-SDAG-NEXT: v_fma_f32 v2, v0, s5, v2 -; SI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 ; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s6 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log_f32_from_fpext_math_f16: @@ -6427,24 +6422,14 @@ define half @v_log_fabs_f16(half %in) { } define half @v_log_fneg_fabs_f16(half %in) { -; SI-SDAG-LABEL: v_log_fneg_fabs_f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -|v0| -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_log_fneg_fabs_f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e64 v0, -|v0| -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_log_fneg_fabs_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e64 v0, -|v0| +; SI-NEXT: v_log_f32_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_log_fneg_fabs_f16: ; VI: ; %bb.0: @@ -6508,24 +6493,14 @@ define half @v_log_fneg_fabs_f16(half %in) { } define half @v_log_fneg_f16(half %in) { -; SI-SDAG-LABEL: v_log_fneg_f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_log_fneg_f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_log_fneg_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: v_log_f32_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_log_fneg_f16: ; VI: ; %bb.0: @@ -7571,27 +7546,27 @@ define <4 x half> @v_log_v4f16(<4 x half> %in) { ; SI-SDAG-LABEL: v_log_v4f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317218, v3 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 ; SI-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -7773,27 +7748,27 @@ define <4 x half> @v_log_v4f16_fast(<4 x half> %in) { ; SI-SDAG-LABEL: v_log_v4f16_fast: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317218, v3 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 ; SI-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll index 843b829f28742..478580ff8ec0a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll @@ -5901,26 +5901,21 @@ define float @v_log10_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) { ; SI-SDAG-LABEL: v_log10_f32_from_fpext_math_f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a ; SI-SDAG-NEXT: s_mov_b32 s5, 0x3284fbcf +; SI-SDAG-NEXT: s_mov_b32 s6, 0x7f800000 ; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 -; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc -; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 ; SI-SDAG-NEXT: v_fma_f32 v2, v0, s4, -v1 ; SI-SDAG-NEXT: v_fma_f32 v2, v0, s5, v2 -; SI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 ; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s6 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log10_f32_from_fpext_math_f16: @@ -6427,24 +6422,14 @@ define half @v_log10_fabs_f16(half %in) { } define half @v_log10_fneg_fabs_f16(half %in) { -; SI-SDAG-LABEL: v_log10_fneg_fabs_f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -|v0| -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_log10_fneg_fabs_f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e64 v0, -|v0| -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_log10_fneg_fabs_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e64 v0, -|v0| +; SI-NEXT: v_log_f32_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_log10_fneg_fabs_f16: ; VI: ; %bb.0: @@ -6508,24 +6493,14 @@ define half @v_log10_fneg_fabs_f16(half %in) { } define half @v_log10_fneg_f16(half %in) { -; SI-SDAG-LABEL: v_log10_fneg_f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_log10_fneg_f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_log10_fneg_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: v_log_f32_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_log10_fneg_f16: ; VI: ; %bb.0: @@ -7571,27 +7546,27 @@ define <4 x half> @v_log10_v4f16(<4 x half> %in) { ; SI-SDAG-LABEL: v_log10_v4f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209b, v3 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 ; SI-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -7773,27 +7748,27 @@ define <4 x half> @v_log10_v4f16_fast(<4 x half> %in) { ; SI-SDAG-LABEL: v_log10_v4f16_fast: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209b, v3 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 ; SI-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll index 35ae1337d8e76..8401e05b39c19 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll @@ -3525,17 +3525,12 @@ define float @v_log2_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) { ; SI-SDAG-LABEL: v_log2_f32_from_fpext_math_f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 -; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc -; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log2_f32_from_fpext_math_f16: @@ -3855,22 +3850,13 @@ define half @v_log2_fabs_f16(half %in) { } define half @v_log2_fneg_fabs_f16(half %in) { -; SI-SDAG-LABEL: v_log2_fneg_fabs_f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -|v0| -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_log2_fneg_fabs_f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e64 v0, -|v0| -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_log2_fneg_fabs_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e64 v0, -|v0| +; SI-NEXT: v_log_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_log2_fneg_fabs_f16: ; VI: ; %bb.0: @@ -3924,22 +3910,13 @@ define half @v_log2_fneg_fabs_f16(half %in) { } define half @v_log2_fneg_f16(half %in) { -; SI-SDAG-LABEL: v_log2_fneg_f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_log2_fneg_f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_log2_fneg_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: v_log_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_log2_fneg_f16: ; VI: ; %bb.0: @@ -4784,23 +4761,23 @@ define <4 x half> @v_log2_v4f16(<4 x half> %in) { ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-SDAG-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log2_v4f16: @@ -4936,23 +4913,23 @@ define <4 x half> @v_log2_v4f16_fast(<4 x half> %in) { ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-SDAG-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log2_v4f16_fast: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll index 3c27adde10b78..48f6c96df139d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll @@ -315,6 +315,8 @@ define half @v_maximum_f16__nnan_src0(half %arg0, half %src1) { ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX7-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_max_f32_e32 v3, v0, v1 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc @@ -413,6 +415,8 @@ define half @v_maximum_f16__nnan_src1(half %src0, half %arg1) { ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX7-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_max_f32_e32 v3, v0, v1 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc @@ -624,15 +628,15 @@ define <2 x half> @v_maximum_v2f16(<2 x half> %src0, <2 x half> %src1) { ; GFX7-LABEL: v_maximum_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX7-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v3, v2 +; GFX7-NEXT: v_max_f32_e32 v4, v2, v3 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v2, v3 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc ; GFX7-NEXT: v_max_f32_e32 v3, v0, v1 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 @@ -733,13 +737,13 @@ define <2 x half> @v_maximum_v2f16__nnan(<2 x half> %src0, <2 x half> %src1) { ; GFX7-LABEL: v_maximum_v2f16__nnan: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_max_f32_e32 v2, v3, v2 +; GFX7-NEXT: v_max_f32_e32 v2, v2, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -790,15 +794,15 @@ define <2 x half> @v_maximum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) { ; GFX7-LABEL: v_maximum_v2f16__nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX7-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v3, v2 +; GFX7-NEXT: v_max_f32_e32 v4, v2, v3 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v2, v3 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc ; GFX7-NEXT: v_max_f32_e32 v3, v0, v1 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 @@ -899,13 +903,13 @@ define <2 x half> @v_maximum_v2f16__nnan_nsz(<2 x half> %src0, <2 x half> %src1) ; GFX7-LABEL: v_maximum_v2f16__nnan_nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_max_f32_e32 v2, v3, v2 +; GFX7-NEXT: v_max_f32_e32 v2, v2, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -956,20 +960,20 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX7-LABEL: s_maximum_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_lshr_b32 s4, s17, 16 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, s4 ; GFX7-NEXT: s_lshr_b32 s4, s16, 16 +; GFX7-NEXT: s_lshr_b32 s5, s17, 16 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, s5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, s4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, s17 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, s16 -; GFX7-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX7-NEXT: v_max_f32_e32 v4, v1, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, s17 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, s16 +; GFX7-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX7-NEXT: v_max_f32_e32 v2, v1, v0 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX7-NEXT: v_max_f32_e32 v1, v3, v2 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v3, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX7-NEXT: v_max_f32_e32 v1, v5, v4 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v5, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 @@ -1107,28 +1111,28 @@ define <3 x half> @v_maximum_v3f16(<3 x half> %src0, <3 x half> %src1) { ; GFX7-LABEL: v_maximum_v3f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_max_f32_e32 v6, v5, v4 +; GFX7-NEXT: v_max_f32_e32 v6, v4, v5 ; GFX7-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v4, v5 ; GFX7-NEXT: v_cndmask_b32_e32 v4, v7, v6, vcc ; GFX7-NEXT: v_max_f32_e32 v6, v0, v2 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_max_f32_e32 v5, v1, v3 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1241,21 +1245,21 @@ define <3 x half> @v_maximum_v3f16__nnan(<3 x half> %src0, <3 x half> %src1) { ; GFX7-LABEL: v_maximum_v3f16__nnan: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX7-NEXT: v_max_f32_e32 v4, v4, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1307,28 +1311,28 @@ define <3 x half> @v_maximum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) { ; GFX7-LABEL: v_maximum_v3f16__nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_max_f32_e32 v6, v5, v4 +; GFX7-NEXT: v_max_f32_e32 v6, v4, v5 ; GFX7-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v4, v5 ; GFX7-NEXT: v_cndmask_b32_e32 v4, v7, v6, vcc ; GFX7-NEXT: v_max_f32_e32 v6, v0, v2 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_max_f32_e32 v5, v1, v3 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1441,21 +1445,21 @@ define <3 x half> @v_maximum_v3f16__nnan_nsz(<3 x half> %src0, <3 x half> %src1) ; GFX7-LABEL: v_maximum_v3f16__nnan_nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX7-NEXT: v_max_f32_e32 v4, v4, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1507,38 +1511,38 @@ define <4 x half> @v_maximum_v4f16(<4 x half> %src0, <4 x half> %src1) { ; GFX7-LABEL: v_maximum_v4f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_max_f32_e32 v8, v7, v6 +; GFX7-NEXT: v_max_f32_e32 v8, v4, v5 ; GFX7-NEXT: v_mov_b32_e32 v9, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v7, v6 -; GFX7-NEXT: v_cndmask_b32_e32 v6, v9, v8, vcc -; GFX7-NEXT: v_max_f32_e32 v7, v5, v4 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v5, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v4, v9, v7, vcc -; GFX7-NEXT: v_max_f32_e32 v5, v1, v3 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v4, v5 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v9, v8, vcc +; GFX7-NEXT: v_max_f32_e32 v5, v6, v7 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v6, v7 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc +; GFX7-NEXT: v_max_f32_e32 v6, v1, v3 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v9, v5, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v9, v6, vcc ; GFX7-NEXT: v_max_f32_e32 v3, v0, v2 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v3, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1672,29 +1676,29 @@ define <4 x half> @v_maximum_v4f16__nnan(<4 x half> %src0, <4 x half> %src1) { ; GFX7-LABEL: v_maximum_v4f16__nnan: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v4, v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_max_f32_e32 v4, v5, v4 -; GFX7-NEXT: v_max_f32_e32 v6, v7, v6 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1748,38 +1752,38 @@ define <4 x half> @v_maximum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) { ; GFX7-LABEL: v_maximum_v4f16__nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_max_f32_e32 v8, v7, v6 +; GFX7-NEXT: v_max_f32_e32 v8, v4, v5 ; GFX7-NEXT: v_mov_b32_e32 v9, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v7, v6 -; GFX7-NEXT: v_cndmask_b32_e32 v6, v9, v8, vcc -; GFX7-NEXT: v_max_f32_e32 v7, v5, v4 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v5, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v4, v9, v7, vcc -; GFX7-NEXT: v_max_f32_e32 v5, v1, v3 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v4, v5 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v9, v8, vcc +; GFX7-NEXT: v_max_f32_e32 v5, v6, v7 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v6, v7 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc +; GFX7-NEXT: v_max_f32_e32 v6, v1, v3 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v9, v5, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v9, v6, vcc ; GFX7-NEXT: v_max_f32_e32 v3, v0, v2 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v3, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1913,29 +1917,29 @@ define <4 x half> @v_maximum_v4f16__nnan_nsz(<4 x half> %src0, <4 x half> %src1) ; GFX7-LABEL: v_maximum_v4f16__nnan_nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v4, v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_max_f32_e32 v4, v5, v4 -; GFX7-NEXT: v_max_f32_e32 v6, v7, v6 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1989,71 +1993,71 @@ define <8 x half> @v_maximum_v8f16(<8 x half> %src0, <8 x half> %src1) { ; GFX7-LABEL: v_maximum_v8f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX7-NEXT: v_max_f32_e32 v15, v10, v11 +; GFX7-NEXT: v_mov_b32_e32 v16, 0x7fc00000 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v10, v11 +; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v14 ; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; GFX7-NEXT: v_cndmask_b32_e32 v10, v16, v15, vcc +; GFX7-NEXT: v_max_f32_e32 v17, v9, v12 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v9, v12 +; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v14 ; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_max_f32_e32 v16, v15, v14 -; GFX7-NEXT: v_mov_b32_e32 v17, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v15, v14 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v14, v17, v16, vcc -; GFX7-NEXT: v_max_f32_e32 v15, v13, v12 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v13, v12 +; GFX7-NEXT: v_max_f32_e32 v15, v13, v11 +; GFX7-NEXT: v_cndmask_b32_e32 v9, v16, v17, vcc +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v13, v11 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v12, v17, v15, vcc -; GFX7-NEXT: v_max_f32_e32 v13, v11, v10 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v11, v10 +; GFX7-NEXT: v_cndmask_b32_e32 v11, v16, v15, vcc +; GFX7-NEXT: v_max_f32_e32 v13, v8, v12 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v8, v12 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v10, v17, v13, vcc -; GFX7-NEXT: v_max_f32_e32 v11, v9, v8 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v9, v8 -; GFX7-NEXT: v_cndmask_b32_e32 v8, v17, v11, vcc -; GFX7-NEXT: v_max_f32_e32 v9, v3, v7 +; GFX7-NEXT: v_cndmask_b32_e32 v8, v16, v13, vcc +; GFX7-NEXT: v_max_f32_e32 v12, v3, v7 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v17, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v3, v16, v12, vcc ; GFX7-NEXT: v_max_f32_e32 v7, v2, v6 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v17, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v2, v16, v7, vcc ; GFX7-NEXT: v_max_f32_e32 v6, v1, v5 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v17, v6, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v16, v6, vcc ; GFX7-NEXT: v_max_f32_e32 v5, v0, v4 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v5, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v8 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v16, v5, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v11 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v10 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v12 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v14 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v11 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v9 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v10 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v8f16: @@ -2268,134 +2272,134 @@ define <16 x half> @v_maximum_v16f16(<16 x half> %src0, <16 x half> %src1) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v14 -; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v16 -; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v16 -; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v19, v16 -; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v13 ; GFX7-NEXT: v_cvt_f32_f16_e32 v20, v16 -; GFX7-NEXT: v_max_f32_e32 v16, v18, v17 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v18, v17 -; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v11 -; GFX7-NEXT: v_max_f32_e32 v17, v20, v19 -; GFX7-NEXT: v_cmp_o_f32_e64 s[4:5], v20, v19 -; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v21, v16 +; GFX7-NEXT: v_max_f32_e32 v16, v19, v18 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v19, v18 +; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; GFX7-NEXT: v_max_f32_e32 v18, v21, v20 +; GFX7-NEXT: v_cmp_o_f32_e64 s[10:11], v21, v20 +; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v21, 16, v10 ; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v18 ; GFX7-NEXT: v_cvt_f32_f16_e32 v19, v19 ; GFX7-NEXT: v_cvt_f32_f16_e32 v20, v20 +; GFX7-NEXT: v_cvt_f32_f16_e32 v21, v21 ; GFX7-NEXT: v_cvt_f32_f16_e32 v22, v22 ; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GFX7-NEXT: v_max_f32_e32 v23, v19, v18 -; GFX7-NEXT: v_cmp_o_f32_e64 s[6:7], v19, v18 -; GFX7-NEXT: v_max_f32_e32 v18, v22, v20 -; GFX7-NEXT: v_cmp_o_f32_e64 s[8:9], v22, v20 -; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GFX7-NEXT: v_max_f32_e32 v23, v20, v19 +; GFX7-NEXT: v_cmp_o_f32_e64 s[4:5], v20, v19 +; GFX7-NEXT: v_max_f32_e32 v19, v22, v21 +; GFX7-NEXT: v_cmp_o_f32_e64 s[6:7], v22, v21 +; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; GFX7-NEXT: v_lshrrev_b32_e32 v21, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v20, v20 +; GFX7-NEXT: v_cvt_f32_f16_e32 v21, v21 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-NEXT: v_max_f32_e32 v22, v20, v19 -; GFX7-NEXT: v_cmp_o_f32_e64 s[10:11], v20, v19 -; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_max_f32_e32 v22, v21, v20 +; GFX7-NEXT: v_cmp_o_f32_e64 s[8:9], v21, v20 +; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v21, 16, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v20, v20 +; GFX7-NEXT: v_cvt_f32_f16_e32 v21, v21 ; GFX7-NEXT: v_cmp_o_f32_e64 s[16:17], v6, v14 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-NEXT: v_max_f32_e32 v24, v20, v19 -; GFX7-NEXT: v_cmp_o_f32_e64 s[12:13], v20, v19 -; GFX7-NEXT: v_cvt_f32_f16_e32 v19, v15 -; GFX7-NEXT: v_cvt_f32_f16_e32 v20, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_max_f32_e32 v25, v20, v19 -; GFX7-NEXT: v_cmp_o_f32_e64 s[14:15], v20, v19 -; GFX7-NEXT: v_max_f32_e32 v19, v6, v14 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v13 -; GFX7-NEXT: v_mov_b32_e32 v21, 0x7fc00000 +; GFX7-NEXT: v_max_f32_e32 v24, v21, v20 +; GFX7-NEXT: v_cmp_o_f32_e64 s[12:13], v21, v20 +; GFX7-NEXT: v_cvt_f32_f16_e32 v20, v15 +; GFX7-NEXT: v_cvt_f32_f16_e32 v21, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-NEXT: v_max_f32_e32 v25, v21, v20 +; GFX7-NEXT: v_cmp_o_f32_e64 s[14:15], v21, v20 +; GFX7-NEXT: v_max_f32_e32 v20, v6, v14 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v13 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v17, 0x7fc00000 +; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9 ; GFX7-NEXT: v_max_f32_e32 v13, v5, v6 ; GFX7-NEXT: v_cmp_o_f32_e64 s[18:19], v5, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v12 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_o_f32_e64 s[22:23], v3, v11 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_max_f32_e32 v14, v6, v5 ; GFX7-NEXT: v_cmp_o_f32_e64 s[20:21], v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v15 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v12 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v12 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_max_f32_e32 v12, v3, v11 -; GFX7-NEXT: v_cndmask_b32_e32 v15, v21, v16, vcc -; GFX7-NEXT: v_max_f32_e32 v3, v6, v5 -; GFX7-NEXT: v_max_f32_e32 v11, v4, v7 -; GFX7-NEXT: v_cmp_o_f32_e64 s[24:25], v6, v5 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v4, v7 -; GFX7-NEXT: v_cndmask_b32_e64 v6, v21, v3, s[24:25] -; GFX7-NEXT: v_max_f32_e32 v3, v2, v10 -; GFX7-NEXT: v_cndmask_b32_e32 v4, v21, v11, vcc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v11 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GFX7-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc +; GFX7-NEXT: v_max_f32_e32 v12, v3, v6 +; GFX7-NEXT: v_cmp_o_f32_e64 s[22:23], v3, v6 +; GFX7-NEXT: v_max_f32_e32 v3, v4, v5 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v4, v5 +; GFX7-NEXT: v_max_f32_e32 v6, v7, v11 +; GFX7-NEXT: v_cmp_o_f32_e64 s[24:25], v7, v11 +; GFX7-NEXT: v_max_f32_e32 v7, v2, v10 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v2, v10 -; GFX7-NEXT: v_max_f32_e32 v5, v1, v9 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v21, v3, vcc -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v9 -; GFX7-NEXT: v_max_f32_e32 v7, v0, v8 -; GFX7-NEXT: v_cndmask_b32_e64 v20, v21, v24, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v21, v5, vcc +; GFX7-NEXT: v_max_f32_e32 v4, v0, v8 +; GFX7-NEXT: v_cndmask_b32_e64 v21, v17, v24, s[12:13] +; GFX7-NEXT: v_cndmask_b32_e32 v2, v17, v7, vcc ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v8 -; GFX7-NEXT: v_cndmask_b32_e64 v16, v21, v17, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v17, v21, v18, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v18, v21, v22, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e32 v0, v21, v7, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v20 +; GFX7-NEXT: v_cndmask_b32_e64 v11, v17, v18, s[10:11] +; GFX7-NEXT: v_max_f32_e32 v15, v1, v9 +; GFX7-NEXT: v_cndmask_b32_e64 v18, v17, v22, s[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v4, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v21 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v9 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v18 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v17, v15, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v18 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cndmask_b32_e64 v11, v21, v23, s[6:7] -; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v17 -; GFX7-NEXT: v_cndmask_b32_e64 v12, v21, v12, s[22:23] -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v11 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v12 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cndmask_b32_e64 v14, v21, v14, s[20:21] -; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cndmask_b32_e64 v13, v21, v13, s[18:19] -; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v14 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v16 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v13 +; GFX7-NEXT: v_cndmask_b32_e64 v19, v17, v19, s[6:7] ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_cndmask_b32_e64 v22, v21, v25, s[14:15] -; GFX7-NEXT: v_cndmask_b32_e64 v19, v21, v19, s[16:17] -; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v15 -; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v8, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v19 -; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v22 +; GFX7-NEXT: v_cndmask_b32_e64 v6, v17, v6, s[24:25] +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v19 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v6 +; GFX7-NEXT: v_cndmask_b32_e64 v5, v17, v23, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v6 -; GFX7-NEXT: v_or_b32_e32 v7, v10, v7 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v5 +; GFX7-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[22:23] +; GFX7-NEXT: v_cndmask_b32_e64 v22, v17, v25, s[14:15] +; GFX7-NEXT: v_cndmask_b32_e64 v20, v17, v20, s[16:17] +; GFX7-NEXT: v_cndmask_b32_e64 v13, v17, v13, s[18:19] +; GFX7-NEXT: v_cndmask_b32_e64 v14, v17, v14, s[20:21] +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v16 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v12 +; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v22 +; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v20 +; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v14 +; GFX7-NEXT: v_cvt_f16_f32_e32 v12, v13 +; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v4, v11, v4 +; GFX7-NEXT: v_or_b32_e32 v5, v12, v5 +; GFX7-NEXT: v_or_b32_e32 v6, v10, v6 +; GFX7-NEXT: v_or_b32_e32 v7, v9, v7 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v16f16: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll index de24617e058dd..69f17ed072425 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -442,14 +442,14 @@ define amdgpu_kernel void @maxnum_v2f16( ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s5, s2, 16 ; SI-NEXT: s_lshr_b32 s6, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s2 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_max_f32_e32 v0, v0, v1 +; SI-NEXT: v_max_f32_e32 v0, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_max_f32_e32 v1, v2, v3 +; SI-NEXT: v_max_f32_e32 v1, v3, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 @@ -731,24 +731,24 @@ define amdgpu_kernel void @maxnum_v3f16( ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 -; SI-NEXT: s_lshr_b32 s7, s6, 16 -; SI-NEXT: s_lshr_b32 s8, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 +; SI-NEXT: s_lshr_b32 s8, s6, 16 +; SI-NEXT: s_lshr_b32 s9, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s5 -; SI-NEXT: v_max_f32_e32 v1, v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s7 +; SI-NEXT: v_max_f32_e32 v1, v2, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_max_f32_e32 v2, v3, v4 -; SI-NEXT: v_max_f32_e32 v0, v0, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_max_f32_e32 v0, v3, v0 +; SI-NEXT: v_max_f32_e32 v2, v5, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: maxnum_v3f16: @@ -859,39 +859,39 @@ define amdgpu_kernel void @maxnum_v4f16( ; SI-LABEL: maxnum_v4f16: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dwordx2 s[6:7], s[10:11], 0x0 -; SI-NEXT: s_mov_b32 s0, s8 +; SI-NEXT: s_load_dwordx2 s[4:5], s[10:11], 0x0 +; SI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; SI-NEXT: s_mov_b32 s1, s9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; SI-NEXT: s_mov_b32 s0, s8 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 -; SI-NEXT: s_lshr_b32 s6, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 -; SI-NEXT: s_lshr_b32 s6, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s6, s5, 16 -; SI-NEXT: s_lshr_b32 s4, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s5 -; SI-NEXT: v_max_f32_e32 v3, v3, v5 -; SI-NEXT: v_max_f32_e32 v2, v2, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_max_f32_e32 v1, v1, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_max_f32_e32 v0, v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_lshr_b32 s9, s5, 16 +; SI-NEXT: s_lshr_b32 s10, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 +; SI-NEXT: s_lshr_b32 s8, s4, 16 +; SI-NEXT: s_lshr_b32 s5, s6, 16 +; SI-NEXT: v_max_f32_e32 v0, v1, v0 +; SI-NEXT: v_max_f32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_max_f32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_max_f32_e32 v3, v5, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -1009,24 +1009,24 @@ define amdgpu_kernel void @fmax_v4f16_imm_a( ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: s_lshr_b32 s5, s5, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_max_f32_e32 v1, 0x40400000, v1 -; SI-NEXT: v_max_f32_e32 v0, 0x41000000, v0 +; SI-NEXT: s_lshr_b32 s7, s5, 16 +; SI-NEXT: s_lshr_b32 s6, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; SI-NEXT: v_max_f32_e32 v2, 4.0, v2 +; SI-NEXT: v_max_f32_e32 v0, 0x40400000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_max_f32_e32 v3, 2.0, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_max_f32_e32 v1, 0x41000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll index e79324d7655fc..0e91d905d5585 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -473,14 +473,14 @@ define amdgpu_kernel void @minnum_v2f16_ieee( ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s5, s2, 16 ; SI-NEXT: s_lshr_b32 s6, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s2 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_min_f32_e32 v0, v0, v1 +; SI-NEXT: v_min_f32_e32 v0, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_min_f32_e32 v1, v2, v3 +; SI-NEXT: v_min_f32_e32 v1, v3, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 @@ -576,13 +576,13 @@ entry: define amdgpu_ps <2 x half> @minnum_v2f16_no_ieee(<2 x half> %a, <2 x half> %b) #0 { ; SI-LABEL: minnum_v2f16_no_ieee: ; SI: ; %bb.0: -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_min_f32_e32 v2, v3, v2 +; SI-NEXT: v_min_f32_e32 v2, v2, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_min_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -796,24 +796,24 @@ define amdgpu_kernel void @minnum_v3f16( ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 -; SI-NEXT: s_lshr_b32 s7, s6, 16 -; SI-NEXT: s_lshr_b32 s8, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 +; SI-NEXT: s_lshr_b32 s8, s6, 16 +; SI-NEXT: s_lshr_b32 s9, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s5 -; SI-NEXT: v_min_f32_e32 v1, v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s7 +; SI-NEXT: v_min_f32_e32 v1, v2, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_min_f32_e32 v2, v3, v4 -; SI-NEXT: v_min_f32_e32 v0, v0, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_min_f32_e32 v0, v3, v0 +; SI-NEXT: v_min_f32_e32 v2, v5, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: minnum_v3f16: @@ -923,39 +923,39 @@ define amdgpu_kernel void @minnum_v4f16( ; SI-LABEL: minnum_v4f16: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dwordx2 s[6:7], s[10:11], 0x0 -; SI-NEXT: s_mov_b32 s0, s8 +; SI-NEXT: s_load_dwordx2 s[4:5], s[10:11], 0x0 +; SI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; SI-NEXT: s_mov_b32 s1, s9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; SI-NEXT: s_mov_b32 s0, s8 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 -; SI-NEXT: s_lshr_b32 s6, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 -; SI-NEXT: s_lshr_b32 s6, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s6, s5, 16 -; SI-NEXT: s_lshr_b32 s4, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s5 -; SI-NEXT: v_min_f32_e32 v3, v3, v5 -; SI-NEXT: v_min_f32_e32 v2, v2, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_min_f32_e32 v1, v1, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_min_f32_e32 v0, v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_lshr_b32 s9, s5, 16 +; SI-NEXT: s_lshr_b32 s10, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 +; SI-NEXT: s_lshr_b32 s8, s4, 16 +; SI-NEXT: s_lshr_b32 s5, s6, 16 +; SI-NEXT: v_min_f32_e32 v0, v1, v0 +; SI-NEXT: v_min_f32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_min_f32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_min_f32_e32 v3, v5, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -1072,24 +1072,24 @@ define amdgpu_kernel void @fmin_v4f16_imm_a( ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: s_lshr_b32 s5, s5, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_min_f32_e32 v1, 0x40400000, v1 -; SI-NEXT: v_min_f32_e32 v0, 0x41000000, v0 +; SI-NEXT: s_lshr_b32 s7, s5, 16 +; SI-NEXT: s_lshr_b32 s6, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; SI-NEXT: v_min_f32_e32 v2, 4.0, v2 +; SI-NEXT: v_min_f32_e32 v0, 0x40400000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_min_f32_e32 v3, 2.0, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_min_f32_e32 v1, 0x41000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll index b7fc76aecf080..2989ff02c9e8e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll @@ -143,19 +143,19 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX6-NEXT: s_mov_b32 s4, s0 ; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x3e22f983, v1 -; GFX6-NEXT: v_fract_f32_e32 v1, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x3e22f983, v0 +; GFX6-NEXT: v_fract_f32_e32 v1, v1 +; GFX6-NEXT: v_sin_f32_e32 v1, v1 ; GFX6-NEXT: v_fract_f32_e32 v0, v0 ; GFX6-NEXT: v_sin_f32_e32 v0, v0 -; GFX6-NEXT: v_sin_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index 9778c61c44e6e..90bf849483196 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -6017,97 +6017,77 @@ define <2 x half> @local_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX7-LABEL: local_atomic_fadd_ret_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v2, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX7-NEXT: ds_read_b32 v3, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fadd_ret_v2f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB20_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fadd ptr addrspace(3) %ptr, <2 x half> %val seq_cst @@ -6252,98 +6232,78 @@ define <2 x half> @local_atomic_fadd_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX7-LABEL: local_atomic_fadd_ret_v2f16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v2, v0 offset:65532 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX7-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 offset:65532 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB21_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fadd_ret_v2f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB21_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fadd ptr addrspace(3) %gep, <2 x half> %val seq_cst @@ -6482,41 +6442,32 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX7-LABEL: local_atomic_fadd_noret_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v2, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX7-NEXT: ds_read_b32 v3, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB22_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6526,41 +6477,32 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX6-LABEL: local_atomic_fadd_noret_v2f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB22_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6702,41 +6644,32 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX7-LABEL: local_atomic_fadd_noret_v2f16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v2, v0 offset:65532 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX7-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 offset:65532 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB23_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6746,42 +6679,33 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX6-LABEL: local_atomic_fadd_noret_v2f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB23_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll index 91add012bdcfa..a71938582da52 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll @@ -5687,97 +5687,77 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX7-LABEL: local_atomic_fmax_ret_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v2, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX7-NEXT: ds_read_b32 v3, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmax_ret_v2f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX6-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB20_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fmax ptr addrspace(3) %ptr, <2 x half> %val seq_cst @@ -5970,98 +5950,78 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX7-LABEL: local_atomic_fmax_ret_v2f16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v2, v0 offset:65532 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX7-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 offset:65532 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB21_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmax_ret_v2f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX6-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB21_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fmax ptr addrspace(3) %gep, <2 x half> %val seq_cst @@ -6247,41 +6207,32 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX7-LABEL: local_atomic_fmax_noret_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v2, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX7-NEXT: ds_read_b32 v3, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB22_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6291,41 +6242,32 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX6-LABEL: local_atomic_fmax_noret_v2f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX6-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB22_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6514,41 +6456,32 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX7-LABEL: local_atomic_fmax_noret_v2f16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v2, v0 offset:65532 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX7-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 offset:65532 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB23_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6558,42 +6491,33 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX6-LABEL: local_atomic_fmax_noret_v2f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX6-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB23_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll index 8597c2e256584..19a0d8bd717f7 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll @@ -5687,97 +5687,77 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX7-LABEL: local_atomic_fmin_ret_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v2, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX7-NEXT: ds_read_b32 v3, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmin_ret_v2f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX6-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB20_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fmin ptr addrspace(3) %ptr, <2 x half> %val seq_cst @@ -5970,98 +5950,78 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX7-LABEL: local_atomic_fmin_ret_v2f16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v2, v0 offset:65532 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX7-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 offset:65532 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB21_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmin_ret_v2f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX6-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB21_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fmin ptr addrspace(3) %gep, <2 x half> %val seq_cst @@ -6247,41 +6207,32 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX7-LABEL: local_atomic_fmin_noret_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v2, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX7-NEXT: ds_read_b32 v3, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB22_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6291,41 +6242,32 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX6-LABEL: local_atomic_fmin_noret_v2f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX6-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB22_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6514,41 +6456,32 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX7-LABEL: local_atomic_fmin_noret_v2f16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v2, v0 offset:65532 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX7-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 offset:65532 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB23_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6558,42 +6491,33 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX6-LABEL: local_atomic_fmin_noret_v2f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX6-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB23_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll index 290d3117cac9a..e560215e4c066 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll @@ -6510,97 +6510,77 @@ define <2 x half> @local_atomic_fsub_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX7-LABEL: local_atomic_fsub_ret_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v2, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX7-NEXT: ds_read_b32 v3, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX7-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fsub_ret_v2f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX6-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_sub_f32_e32 v6, v6, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB20_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fsub ptr addrspace(3) %ptr, <2 x half> %val seq_cst @@ -6776,98 +6756,78 @@ define <2 x half> @local_atomic_fsub_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX7-LABEL: local_atomic_fsub_ret_v2f16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v2, v0 offset:65532 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX7-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX7-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 offset:65532 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB21_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fsub_ret_v2f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX6-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_sub_f32_e32 v6, v6, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB21_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fsub ptr addrspace(3) %gep, <2 x half> %val seq_cst @@ -7034,41 +6994,32 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX7-LABEL: local_atomic_fsub_noret_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v2, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX7-NEXT: ds_read_b32 v3, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX7-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB22_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7078,41 +7029,32 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX6-LABEL: local_atomic_fsub_noret_v2f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX6-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_sub_f32_e32 v6, v6, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB22_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7282,41 +7224,32 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX7-LABEL: local_atomic_fsub_noret_v2f16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v2, v0 offset:65532 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX7-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX7-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 offset:65532 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB23_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7326,42 +7259,33 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX6-LABEL: local_atomic_fsub_noret_v2f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX6-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_sub_f32_e32 v6, v6, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB23_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll index 7dc9304d5715b..7044afb09e371 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll @@ -38,9 +38,9 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo(half %src0, half %s ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 ; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -104,9 +104,9 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo(half %src0, half %s ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 ; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -179,9 +179,9 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo(half %src0, half %src ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 ; SDAG-CI-NEXT: v_and_b32_e32 v1, 0xffff, v3 @@ -255,9 +255,9 @@ define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack(half %src0, half %src1, ha ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 ; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -337,9 +337,9 @@ define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext(half %src0, half %src ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 ; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -434,9 +434,9 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt(half % ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -498,12 +498,16 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt(half ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_max_f32_e32 v0, 0, v0 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_min_f32_e32 v0, 1.0, v0 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] @@ -581,16 +585,20 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: s_mov_b32 s7, 0xf000 ; SDAG-CI-NEXT: s_mov_b32 s6, -1 ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v1, v0 clamp +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v0 ; SDAG-CI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) +; SDAG-CI-NEXT: v_max_f32_e32 v1, 0, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_min_f32_e32 v1, 1.0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll index 87d33c1c063eb..154d6c7079672 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll @@ -136,9 +136,9 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f16lo(half %src0, half %src1, half %src ; SDAG-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] @@ -197,9 +197,9 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f16lo_no_flush(half %src0, half %src1, ; SDAG-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo_no_flush: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_fma_f32 v0, v0, v1, v2 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] @@ -252,8 +252,8 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32(half %src0, half %src1, float %src2 ; SDAG-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] @@ -304,11 +304,15 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %sr ; SDAG-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_max_f32_e32 v0, 0, v0 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_min_f32_e32 v0, 1.0, v0 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; @@ -378,8 +382,8 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt(half %src0, half %src ; SDAG-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] @@ -469,12 +473,12 @@ define <2 x half> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_mac_f32_e32 v3, v5, v4 ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v3 @@ -601,15 +605,15 @@ define <3 x half> @v_mad_mix_v3f32(<3 x half> %src0, <3 x half> %src1, <3 x half ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SDAG-CI-NEXT: v_mac_f32_e32 v6, v8, v7 ; SDAG-CI-NEXT: v_mac_f32_e32 v4, v0, v2 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v6 @@ -777,36 +781,36 @@ define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half ; SDAG-CI-LABEL: v_mad_mix_v4f32: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_mac_f32_e32 v9, v11, v10 -; SDAG-CI-NEXT: v_mac_f32_e32 v6, v8, v7 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SDAG-CI-NEXT: v_mac_f32_e32 v6, v10, v8 +; SDAG-CI-NEXT: v_mac_f32_e32 v7, v11, v9 ; SDAG-CI-NEXT: v_mac_f32_e32 v4, v0, v2 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v9 ; SDAG-CI-NEXT: v_mac_f32_e32 v5, v1, v3 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v6 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v6 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v4 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v5 -; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SDAG-CI-NEXT: v_or_b32_e32 v0, v1, v0 -; SDAG-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SDAG-CI-NEXT: v_or_b32_e32 v1, v3, v1 +; SDAG-CI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SDAG-CI-NEXT: v_or_b32_e32 v0, v2, v0 +; SDAG-CI-NEXT: v_or_b32_e32 v1, v3, v4 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX1100-LABEL: v_mad_mix_v4f32: @@ -971,22 +975,30 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %s ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_mac_f32_e32 v3, v5, v4 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v3 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v1, v1 clamp +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_max_f32_e32 v1, 0, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-CI-NEXT: v_max_f32_e32 v0, 0, v0 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_min_f32_e32 v1, 1.0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SDAG-CI-NEXT: v_or_b32_e32 v0, v1, v0 +; SDAG-CI-NEXT: v_min_f32_e32 v0, 1.0, v0 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SDAG-CI-NEXT: v_or_b32_e32 v0, v0, v1 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX1100-LABEL: v_mad_mix_v2f32_clamp_postcvt: @@ -1140,29 +1152,41 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SDAG-CI-NEXT: v_mac_f32_e32 v6, v8, v7 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SDAG-CI-NEXT: v_mac_f32_e32 v4, v0, v2 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v4 ; SDAG-CI-NEXT: v_mac_f32_e32 v5, v1, v3 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v1, v1 clamp +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v5 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v6 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_max_f32_e32 v1, 0, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-CI-NEXT: v_max_f32_e32 v0, 0, v0 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_max_f32_e32 v2, 0, v2 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_min_f32_e32 v1, 1.0, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; SDAG-CI-NEXT: v_min_f32_e32 v0, 1.0, v0 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_min_f32_e32 v1, 1.0, v2 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SDAG-CI-NEXT: v_or_b32_e32 v0, v2, v0 +; SDAG-CI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SDAG-CI-NEXT: v_or_b32_e32 v0, v0, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX1100-LABEL: v_mad_mix_v3f32_clamp_postcvt: @@ -1357,42 +1381,58 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_mac_f32_e32 v6, v8, v7 -; SDAG-CI-NEXT: v_mac_f32_e32 v9, v11, v10 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-CI-NEXT: v_mac_f32_e32 v6, v10, v8 +; SDAG-CI-NEXT: v_mac_f32_e32 v7, v11, v9 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SDAG-CI-NEXT: v_mac_f32_e32 v4, v0, v2 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v6 ; SDAG-CI-NEXT: v_mac_f32_e32 v5, v1, v3 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v9 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v4 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v5 -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v1, v1 clamp -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v6 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_max_f32_e32 v0, 0, v0 +; SDAG-CI-NEXT: v_max_f32_e32 v1, 0, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_max_f32_e32 v2, 0, v2 +; SDAG-CI-NEXT: v_max_f32_e32 v3, 0, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_min_f32_e32 v1, 1.0, v1 +; SDAG-CI-NEXT: v_min_f32_e32 v0, 1.0, v0 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_min_f32_e32 v3, 1.0, v3 +; SDAG-CI-NEXT: v_min_f32_e32 v2, 1.0, v2 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SDAG-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SDAG-CI-NEXT: v_or_b32_e32 v0, v1, v0 -; SDAG-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SDAG-CI-NEXT: v_or_b32_e32 v0, v2, v0 ; SDAG-CI-NEXT: v_or_b32_e32 v1, v3, v1 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; @@ -1560,22 +1600,26 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half> ; SDAG-CI-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v0 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SDAG-CI-NEXT: v_mac_f32_e32 v2, v5, v4 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_mac_f32_e32 v5, v3, v4 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v5 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp -; SDAG-CI-NEXT: v_mac_f32_e32 v3, v0, v1 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v3 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SDAG-CI-NEXT: v_max_f32_e32 v3, 0, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SDAG-CI-NEXT: v_min_f32_e32 v1, 1.0, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SDAG-CI-NEXT: v_or_b32_e32 v0, v1, v0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; @@ -1727,18 +1771,22 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half> ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_mac_f32_e32 v3, v5, v4 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_max_f32_e32 v3, 0, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_min_f32_e32 v0, 1.0, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SDAG-CI-NEXT: v_or_b32_e32 v0, v1, v0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] @@ -1906,12 +1954,12 @@ define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %sr ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_mad_f32 v3, v5, v4, v3 clamp ; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v3 @@ -2078,15 +2126,15 @@ define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %sr ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SDAG-CI-NEXT: v_mad_f32 v6, v8, v7, v6 clamp ; SDAG-CI-NEXT: v_mad_f32 v0, v0, v2, v4 clamp ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v6 @@ -2284,35 +2332,35 @@ define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %sr ; SDAG-CI-LABEL: v_mad_mix_v4f32_clamp_precvt: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_mad_f32 v6, v8, v7, v6 clamp -; SDAG-CI-NEXT: v_mad_f32 v9, v11, v10, v9 clamp +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-CI-NEXT: v_mad_f32 v6, v10, v8, v6 clamp +; SDAG-CI-NEXT: v_mad_f32 v7, v11, v9, v7 clamp +; SDAG-CI-NEXT: v_mad_f32 v1, v1, v3, v5 clamp ; SDAG-CI-NEXT: v_mad_f32 v0, v0, v2, v4 clamp ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v6 -; SDAG-CI-NEXT: v_mad_f32 v1, v1, v3, v5 clamp +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v7 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v9 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SDAG-CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SDAG-CI-NEXT: v_or_b32_e32 v0, v0, v2 -; SDAG-CI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SDAG-CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SDAG-CI-NEXT: v_or_b32_e32 v0, v0, v3 ; SDAG-CI-NEXT: v_or_b32_e32 v1, v1, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix.ll b/llvm/test/CodeGen/AMDGPU/mad-mix.ll index ee250fc74c7ae..fcd9dae983cfb 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix.ll @@ -53,24 +53,14 @@ define float @v_mad_mix_f32_f16lo_f16lo_f16lo(half %src0, half %src1, half %src2 ; VI-NEXT: v_mac_f32_e32 v0, v3, v1 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo: -; SDAG-CI: ; %bb.0: -; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 -; SDAG-CI-NEXT: v_mov_b32_e32 v0, v2 -; SDAG-CI-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo: -; GISEL-CI: ; %bb.0: -; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GISEL-CI-NEXT: v_mac_f32_e32 v0, v3, v1 -; GISEL-CI-NEXT: s_setpc_b64 s[30:31] +; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; CI-NEXT: v_mac_f32_e32 v0, v3, v1 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.ext = fpext half %src2 to float @@ -185,11 +175,10 @@ define float @v_mad_mix_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x half> % ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v0 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 -; SDAG-CI-NEXT: v_mov_b32_e32 v0, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SDAG-CI-NEXT: v_mac_f32_e32 v0, v3, v1 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-CI-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_elt: @@ -271,16 +260,14 @@ define <2 x float> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x hal ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_mac_f32_e32 v3, v5, v4 -; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 -; SDAG-CI-NEXT: v_mov_b32_e32 v0, v2 -; SDAG-CI-NEXT: v_mov_b32_e32 v1, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v6, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v7, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SDAG-CI-NEXT: v_mac_f32_e32 v1, v5, v4 +; SDAG-CI-NEXT: v_mac_f32_e32 v0, v6, v7 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX9GEN-LABEL: v_mad_mix_v2f32: @@ -386,17 +373,16 @@ define <2 x float> @v_mad_mix_v2f32_shuffle(<2 x half> %src0, <2 x half> %src1, ; SDAG-CI-LABEL: v_mad_mix_v2f32_shuffle: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SDAG-CI-NEXT: v_mad_f32 v0, v4, v1, v2 -; SDAG-CI-NEXT: v_mac_f32_e32 v2, v5, v3 -; SDAG-CI-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SDAG-CI-NEXT: v_mad_f32 v0, v4, v0, v1 +; SDAG-CI-NEXT: v_mac_f32_e32 v1, v5, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-CI-LABEL: v_mad_mix_v2f32_shuffle: @@ -463,9 +449,9 @@ define float @v_mad_mix_f32_negf16lo_f16lo_f16lo(half %src0, half %src1, half %s ; SDAG-CI-LABEL: v_mad_mix_f32_negf16lo_f16lo_f16lo: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_mad_f32 v0, -v0, v1, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; @@ -543,11 +529,11 @@ define float @v_mad_mix_f32_absf16lo_f16lo_f16lo(half %src0, half %src1, half %s ; SDAG-CI-LABEL: v_mad_mix_f32_absf16lo_f16lo_f16lo: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v2 ; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 -; SDAG-CI-NEXT: v_mov_b32_e32 v0, v2 +; SDAG-CI-NEXT: v_mac_f32_e32 v1, v0, v3 +; SDAG-CI-NEXT: v_mov_b32_e32 v0, v1 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-CI-LABEL: v_mad_mix_f32_absf16lo_f16lo_f16lo: @@ -606,8 +592,8 @@ define float @v_mad_mix_f32_negabsf16lo_f16lo_f16lo(half %src0, half %src1, half ; SDAG-CI-LABEL: v_mad_mix_f32_negabsf16lo_f16lo_f16lo: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| ; SDAG-CI-NEXT: v_mad_f32 v0, -v0, v1, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] @@ -664,21 +650,13 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32(half %src0, half %src1, float %src2) ; VI-NEXT: v_mad_f32 v0, v0, v1, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32: -; SDAG-CI: ; %bb.0: -; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2 -; SDAG-CI-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32: -; GISEL-CI: ; %bb.0: -; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_mad_f32 v0, v0, v1, v2 -; GISEL-CI-NEXT: s_setpc_b64 s[30:31] +; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_mad_f32 v0, v0, v1, v2 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2) @@ -720,21 +698,13 @@ define float @v_mad_mix_f32_f16lo_f16lo_negf32(half %src0, half %src1, float %sr ; VI-NEXT: v_mad_f32 v0, v0, v1, -v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_negf32: -; SDAG-CI: ; %bb.0: -; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, -v2 -; SDAG-CI-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_negf32: -; GISEL-CI: ; %bb.0: -; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_mad_f32 v0, v0, v1, -v2 -; GISEL-CI-NEXT: s_setpc_b64 s[30:31] +; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_negf32: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_mad_f32 v0, v0, v1, -v2 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.neg = fneg float %src2 @@ -777,21 +747,13 @@ define float @v_mad_mix_f32_f16lo_f16lo_absf32(half %src0, half %src1, float %sr ; VI-NEXT: v_mad_f32 v0, v0, v1, |v2| ; VI-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_absf32: -; SDAG-CI: ; %bb.0: -; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, |v2| -; SDAG-CI-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_absf32: -; GISEL-CI: ; %bb.0: -; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_mad_f32 v0, v0, v1, |v2| -; GISEL-CI-NEXT: s_setpc_b64 s[30:31] +; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_absf32: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_mad_f32 v0, v0, v1, |v2| +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.abs = call float @llvm.fabs.f32(float %src2) @@ -834,21 +796,13 @@ define float @v_mad_mix_f32_f16lo_f16lo_negabsf32(half %src0, half %src1, float ; VI-NEXT: v_mad_f32 v0, v0, v1, -|v2| ; VI-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_negabsf32: -; SDAG-CI: ; %bb.0: -; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, -|v2| -; SDAG-CI-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_negabsf32: -; GISEL-CI: ; %bb.0: -; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_mad_f32 v0, v0, v1, -|v2| -; GISEL-CI-NEXT: s_setpc_b64 s[30:31] +; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_negabsf32: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_mad_f32 v0, v0, v1, -|v2| +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.abs = call float @llvm.fabs.f32(float %src2) @@ -900,13 +854,13 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32imm1(half %src0, half %src1) #0 { ; VI-NEXT: v_mad_f32 v0, v0, v1, 1.0 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imm1: -; SDAG-CI: ; %bb.0: -; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, 1.0 -; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imm1: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_mad_f32 v0, v0, v1, 1.0 +; CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imm1: ; GISEL-GFX1100: ; %bb.0: @@ -929,14 +883,6 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32imm1(half %src0, half %src1) #0 { ; GISEL-GFX906-NEXT: v_mov_b32_e32 v2, 1.0 ; GISEL-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imm1: -; GISEL-CI: ; %bb.0: -; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_mad_f32 v0, v0, v1, 1.0 -; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float 1.0) @@ -985,8 +931,8 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32imminv2pi(half %src0, half %src1) #0 ; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imminv2pi: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_madak_f32 v0, v0, v1, 0x3e22f983 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; @@ -1074,8 +1020,8 @@ define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi(half %src0, half %src1) ; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_madak_f32 v0, v0, v1, 0x3e230000 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; @@ -1177,8 +1123,8 @@ define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imm63(half %src0, half %src1) #0 { ; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_madak_f32 v0, v0, v1, 0x367c0000 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; @@ -1293,10 +1239,10 @@ define <2 x float> @v_mad_mix_v2f32_f32imm1(<2 x half> %src0, <2 x half> %src1) ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, 1.0 ; SDAG-CI-NEXT: v_mad_f32 v1, v3, v2, 1.0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] @@ -1427,10 +1373,10 @@ define <2 x float> @v_mad_mix_v2f32_cvtf16imminv2pi(<2 x half> %src0, <2 x half> ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_mov_b32_e32 v1, 0x3e230000 ; SDAG-CI-NEXT: v_madak_f32 v0, v0, v4, 0x3e230000 ; SDAG-CI-NEXT: v_mac_f32_e32 v1, v3, v2 @@ -1564,10 +1510,10 @@ define <2 x float> @v_mad_mix_v2f32_f32imminv2pi(<2 x half> %src0, <2 x half> %s ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_mov_b32_e32 v1, 0x3e22f983 ; SDAG-CI-NEXT: v_madak_f32 v0, v0, v4, 0x3e22f983 ; SDAG-CI-NEXT: v_mac_f32_e32 v1, v3, v2 @@ -1686,9 +1632,9 @@ define float @v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x h ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; @@ -1841,23 +1787,14 @@ define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals(half %src0, half %sr ; VI-NEXT: v_add_f32_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals: -; SDAG-CI: ; %bb.0: -; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_fma_f32 v0, v0, v1, v2 -; SDAG-CI-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals: -; GISEL-CI: ; %bb.0: -; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GISEL-CI-NEXT: v_fma_f32 v0, v0, v1, v2 -; GISEL-CI-NEXT: s_setpc_b64 s[30:31] +; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_fma_f32 v0, v0, v1, v2 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.ext = fpext half %src2 to float @@ -1903,21 +1840,13 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32_denormals(half %src0, half %src1, fl ; VI-NEXT: v_add_f32_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals: -; SDAG-CI: ; %bb.0: -; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_fma_f32 v0, v0, v1, v2 -; SDAG-CI-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals: -; GISEL-CI: ; %bb.0: -; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_fma_f32 v0, v0, v1, v2 -; GISEL-CI-NEXT: s_setpc_b64 s[30:31] +; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_fma_f32 v0, v0, v1, v2 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2) @@ -1987,15 +1916,15 @@ define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd(half %src0, ; VI-NEXT: v_add_f32_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd: -; SDAG-CI: ; %bb.0: -; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_mul_f32_e32 v0, v0, v1 -; SDAG-CI-NEXT: v_add_f32_e32 v0, v0, v2 -; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_mul_f32_e32 v0, v0, v1 +; CI-NEXT: v_add_f32_e32 v0, v0, v2 +; CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd: ; GISEL-GFX1100: ; %bb.0: @@ -2007,16 +1936,6 @@ define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd(half %src0, ; GISEL-GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1 ; GISEL-GFX1100-NEXT: v_add_f32_e32 v0, v0, v2 ; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd: -; GISEL-CI: ; %bb.0: -; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GISEL-CI-NEXT: v_mul_f32_e32 v0, v0, v1 -; GISEL-CI-NEXT: v_add_f32_e32 v0, v0, v2 -; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.ext = fpext half %src2 to float @@ -2082,14 +2001,14 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd(half %src0, half ; VI-NEXT: v_add_f32_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd: -; SDAG-CI: ; %bb.0: -; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_mul_f32_e32 v0, v0, v1 -; SDAG-CI-NEXT: v_add_f32_e32 v0, v0, v2 -; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_mul_f32_e32 v0, v0, v1 +; CI-NEXT: v_add_f32_e32 v0, v0, v2 +; CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd: ; GISEL-GFX1100: ; %bb.0: @@ -2100,15 +2019,6 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd(half %src0, half ; GISEL-GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1 ; GISEL-GFX1100-NEXT: v_add_f32_e32 v0, v0, v2 ; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd: -; GISEL-CI: ; %bb.0: -; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_mul_f32_e32 v0, v0, v1 -; GISEL-CI-NEXT: v_add_f32_e32 v0, v0, v2 -; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %mul = fmul float %src0.ext, %src1.ext @@ -2153,24 +2063,14 @@ define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd(half %src0, hal ; VI-NEXT: v_mac_f32_e32 v0, v3, v1 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd: -; SDAG-CI: ; %bb.0: -; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 -; SDAG-CI-NEXT: v_mov_b32_e32 v0, v2 -; SDAG-CI-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd: -; GISEL-CI: ; %bb.0: -; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GISEL-CI-NEXT: v_mac_f32_e32 v0, v3, v1 -; GISEL-CI-NEXT: s_setpc_b64 s[30:31] +; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; CI-NEXT: v_mac_f32_e32 v0, v3, v1 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.ext = fpext half %src2 to float @@ -2214,21 +2114,13 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd(half %src0, half %src ; VI-NEXT: v_mad_f32 v0, v0, v1, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd: -; SDAG-CI: ; %bb.0: -; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2 -; SDAG-CI-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd: -; GISEL-CI: ; %bb.0: -; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_mad_f32 v0, v0, v1, v2 -; GISEL-CI-NEXT: s_setpc_b64 s[30:31] +; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_mad_f32 v0, v0, v1, v2 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %mul = fmul contract float %src0.ext, %src1.ext @@ -2276,9 +2168,9 @@ define float @v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo(i32 %src0.arg, half %src1 ; SDAG-CI-LABEL: v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_mad_f32 v0, -v0, v1, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; @@ -2377,11 +2269,11 @@ define float @v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo(i32 %src0.arg, half % ; SDAG-CI-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v3, |v0| ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v2, |v3| -; SDAG-CI-NEXT: v_mac_f32_e32 v0, v2, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SDAG-CI-NEXT: v_mac_f32_e32 v0, v3, v1 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX1100-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo: @@ -2450,25 +2342,15 @@ define float @v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo(i32 %src0.arg, half %src1 ; VI-NEXT: v_mac_f32_e32 v0, v3, v1 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-CI-LABEL: v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo: -; SDAG-CI: ; %bb.0: -; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v2, |v3| -; SDAG-CI-NEXT: v_mac_f32_e32 v0, v2, v1 -; SDAG-CI-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-CI-LABEL: v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo: -; GISEL-CI: ; %bb.0: -; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e64 v3, |v0| -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GISEL-CI-NEXT: v_mac_f32_e32 v0, v3, v1 -; GISEL-CI-NEXT: s_setpc_b64 s[30:31] +; CI-LABEL: v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e64 v3, |v0| +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; CI-NEXT: v_mac_f32_e32 v0, v3, v1 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.arg.bc = bitcast i32 %src0.arg to <2 x half> %src0 = extractelement <2 x half> %src0.arg.bc, i32 1 %src0.abs = call half @llvm.fabs.f16(half %src0) @@ -2519,11 +2401,11 @@ define float @v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo(i32 %src0.arg, half ; SDAG-CI-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v3, -v0 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v2, -v3 -; SDAG-CI-NEXT: v_mac_f32_e32 v0, v2, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SDAG-CI-NEXT: v_mac_f32_e32 v0, v3, v1 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX9GEN-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo: @@ -2606,11 +2488,11 @@ define float @v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo(i32 %src0.arg, half ; SDAG-CI-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v3, |v0| ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v2, |v3| -; SDAG-CI-NEXT: v_mac_f32_e32 v0, v2, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SDAG-CI-NEXT: v_mac_f32_e32 v0, v3, v1 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX9GEN-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo: @@ -2693,11 +2575,11 @@ define float @v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo(i32 %src0.arg, ; SDAG-CI-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v3, -|v0| ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v2, -|v3| -; SDAG-CI-NEXT: v_mac_f32_e32 v0, v2, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SDAG-CI-NEXT: v_mac_f32_e32 v0, v3, v1 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX9GEN-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo: diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.ll index 086c78fd041fc..806d941ac8730 100644 --- a/llvm/test/CodeGen/AMDGPU/maximumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/maximumnum.ll @@ -27,14 +27,23 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16,GFX12-GISEL,GFX12-FAKE16-GISEL %s define half @v_maximumnum_f16(half %x, half %y) { -; GFX7-LABEL: v_maximumnum_f16: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: v_maximumnum_f16: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: v_maximumnum_f16: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_maximumnum_f16: ; GFX8-SDAG: ; %bb.0: @@ -905,14 +914,23 @@ define double @v_maximumnum_f64_1.0(double %x) { } define half @v_maximumnum_f16_s_v(half inreg %x, half %y) { -; GFX7-LABEL: v_maximumnum_f16_s_v: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, s16 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_max_f32_e32 v0, v1, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: v_maximumnum_f16_s_v: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, s16 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v1, v0 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: v_maximumnum_f16_s_v: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, s16 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v1, v0 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_maximumnum_f16_s_v: ; GFX8-SDAG: ; %bb.0: @@ -1070,14 +1088,23 @@ define half @v_maximumnum_f16_s_v(half inreg %x, half %y) { } define half @v_maximumnum_f16_v_s(half %x, half inreg %y) { -; GFX7-LABEL: v_maximumnum_f16_v_s: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, s16 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: v_maximumnum_f16_v_s: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, s16 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: v_maximumnum_f16_v_s: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, s16 +; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_maximumnum_f16_v_s: ; GFX8-SDAG: ; %bb.0: @@ -1235,14 +1262,23 @@ define half @v_maximumnum_f16_v_s(half %x, half inreg %y) { } define half @v_maximumnum_f16_s_s(half inreg %x, half inreg %y) { -; GFX7-LABEL: v_maximumnum_f16_s_s: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, s16 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, s17 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: v_maximumnum_f16_s_s: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, s17 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, s16 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v1, v0 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: v_maximumnum_f16_s_s: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, s16 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, s17 +; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_maximumnum_f16_s_s: ; GFX8-SDAG: ; %bb.0: @@ -2563,14 +2599,23 @@ define float @v_maximumnum_f32_fneg(float %x, float %y) { } define half @v_maximumnum_f16_fabs_rhs(half %x, half %y) { -; GFX7-LABEL: v_maximumnum_f16_fabs_rhs: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: v_maximumnum_f16_fabs_rhs: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1| +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: v_maximumnum_f16_fabs_rhs: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v1, |v1| +; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_maximumnum_f16_fabs_rhs: ; GFX8-SDAG: ; %bb.0: @@ -2716,9 +2761,8 @@ define half @v_maximumnum_f16_fneg_fabs_rhs(half %x, half %y) { ; GFX7-SDAG-LABEL: v_maximumnum_f16_fneg_fabs_rhs: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, -|v1| +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -2874,14 +2918,23 @@ define half @v_maximumnum_f16_fneg_fabs_rhs(half %x, half %y) { } define half @v_maximumnum_f16_fabs(half %x, half %y) { -; GFX7-LABEL: v_maximumnum_f16_fabs: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; GFX7-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: v_maximumnum_f16_fabs: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1| +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: v_maximumnum_f16_fabs: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v1, |v1| +; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_maximumnum_f16_fabs: ; GFX8-SDAG: ; %bb.0: @@ -3028,10 +3081,8 @@ define half @v_maximumnum_f16_fneg(half %x, half %y) { ; GFX7-SDAG-LABEL: v_maximumnum_f16_fneg: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, -v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -3306,10 +3357,10 @@ define <2 x half> @v_maximumnum_v2f16(<2 x half> %x, <2 x half> %y) { ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v2, v3 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 @@ -3460,13 +3511,13 @@ define <2 x half> @v_maximumnum_v2f16_nnan(<2 x half> %x, <2 x half> %y) { ; GFX7-SDAG-LABEL: v_maximumnum_v2f16_nnan: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v3, v2 +; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v2, v3 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -3544,19 +3595,19 @@ define <3 x half> @v_maximumnum_v3f16(<3 x half> %x, <3 x half> %y) { ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_max_f32_e32 v4, v4, v5 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v4 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -3730,21 +3781,21 @@ define <3 x half> @v_maximumnum_v3f16_nnan(<3 x half> %x, <3 x half> %y) { ; GFX7-SDAG-LABEL: v_maximumnum_v3f16_nnan: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX7-SDAG-NEXT: v_max_f32_e32 v4, v4, v5 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v4 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -3833,29 +3884,29 @@ define <4 x half> @v_maximumnum_v4f16(<4 x half> %x, <4 x half> %y) { ; GFX7-SDAG-LABEL: v_maximumnum_v4f16: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_max_f32_e32 v4, v4, v5 -; GFX7-SDAG-NEXT: v_max_f32_e32 v6, v6, v7 -; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v6 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -4057,29 +4108,29 @@ define <4 x half> @v_maximumnum_v4f16_nnan(<4 x half> %x, <4 x half> %y) { ; GFX7-SDAG-LABEL: v_maximumnum_v4f16_nnan: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v4, v4, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v7 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_max_f32_e32 v4, v5, v4 -; GFX7-SDAG-NEXT: v_max_f32_e32 v6, v7, v6 -; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v6 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -4175,39 +4226,39 @@ define <6 x half> @v_maximumnum_v6f16(<6 x half> %x, <6 x half> %y) { ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v1 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; GFX7-SDAG-NEXT: v_max_f32_e32 v6, v6, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v2 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_max_f32_e32 v6, v6, v7 ; GFX7-SDAG-NEXT: v_max_f32_e32 v8, v8, v9 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-SDAG-NEXT: v_max_f32_e32 v10, v10, v11 -; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v4 -; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v8 +; GFX7-SDAG-NEXT: v_max_f32_e32 v9, v10, v11 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v9 ; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v2, v5 +; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v10 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v9 +; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v8 ; GFX7-SDAG-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -4465,54 +4516,54 @@ define <8 x half> @v_maximumnum_v8f16(<8 x half> %x, <8 x half> %y) { ; GFX7-SDAG-LABEL: v_maximumnum_v8f16: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v12, 16, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v0 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v15, 16, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-SDAG-NEXT: v_max_f32_e32 v11, v11, v12 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v15 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_max_f32_e32 v8, v8, v9 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; GFX7-SDAG-NEXT: v_max_f32_e32 v10, v10, v11 +; GFX7-SDAG-NEXT: v_max_f32_e32 v12, v14, v12 +; GFX7-SDAG-NEXT: v_max_f32_e32 v10, v10, v13 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v8 ; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v8 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v12, v12 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GFX7-SDAG-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v2, v6 ; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v5 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v10 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-SDAG-NEXT: v_max_f32_e32 v12, v12, v13 -; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX7-SDAG-NEXT: v_max_f32_e32 v14, v14, v15 -; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v12 -; GFX7-SDAG-NEXT: v_max_f32_e32 v3, v3, v7 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v14 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v12 +; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v10 ; GFX7-SDAG-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX7-SDAG-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX7-SDAG-NEXT: v_or_b32_e32 v3, v3, v11 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_maximumnum_v8f16: @@ -4825,101 +4876,101 @@ define <16 x half> @v_maximumnum_v16f16(<16 x half> %x, <16 x half> %y) { ; GFX7-SDAG-LABEL: v_maximumnum_v16f16: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v16, 16, v15 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v18, 16, v15 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v20, 16, v5 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v21, 16, v13 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v22, 16, v4 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v18, 16, v14 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v20, 16, v13 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v22, 16, v12 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v17, v17 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v19, v19 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v20, v20 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v21, v21 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v22, v22 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GFX7-SDAG-NEXT: v_max_f32_e32 v17, v17, v18 -; GFX7-SDAG-NEXT: v_max_f32_e32 v16, v16, v19 -; GFX7-SDAG-NEXT: v_max_f32_e32 v18, v20, v21 -; GFX7-SDAG-NEXT: v_max_f32_e32 v19, v22, v23 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v20, 16, v3 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v21, 16, v11 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; GFX7-SDAG-NEXT: v_max_f32_e32 v16, v17, v16 +; GFX7-SDAG-NEXT: v_max_f32_e32 v17, v19, v18 +; GFX7-SDAG-NEXT: v_max_f32_e32 v18, v21, v20 +; GFX7-SDAG-NEXT: v_max_f32_e32 v19, v23, v22 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v20, 16, v11 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v22, 16, v10 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v23, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v20, v20 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v21, v21 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v22, v22 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; GFX7-SDAG-NEXT: v_max_f32_e32 v20, v20, v21 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v20, v21, v20 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GFX7-SDAG-NEXT: v_max_f32_e32 v21, v22, v23 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v22, 16, v1 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; GFX7-SDAG-NEXT: v_max_f32_e32 v21, v23, v22 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v22, 16, v9 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v23, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v22, v22 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX7-SDAG-NEXT: v_max_f32_e32 v22, v22, v23 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_max_f32_e32 v22, v23, v22 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v23, 16, v8 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v9 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v22 -; GFX7-SDAG-NEXT: v_max_f32_e32 v23, v23, v24 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_max_f32_e32 v23, v24, v23 ; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v8 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v23 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v9 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v22 ; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v2, v10 ; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v8 ; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v2, v10 -; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v8 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v21 -; GFX7-SDAG-NEXT: v_max_f32_e32 v3, v3, v11 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v21 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v20 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX7-SDAG-NEXT: v_or_b32_e32 v2, v2, v8 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GFX7-SDAG-NEXT: v_max_f32_e32 v4, v4, v12 -; GFX7-SDAG-NEXT: v_or_b32_e32 v3, v3, v8 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v19 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; GFX7-SDAG-NEXT: v_max_f32_e32 v5, v5, v13 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-SDAG-NEXT: v_max_f32_e32 v3, v3, v11 +; GFX7-SDAG-NEXT: v_or_b32_e32 v2, v2, v9 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v18 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX7-SDAG-NEXT: v_or_b32_e32 v4, v4, v8 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v10, v20 +; GFX7-SDAG-NEXT: v_max_f32_e32 v7, v7, v15 ; GFX7-SDAG-NEXT: v_max_f32_e32 v6, v6, v14 -; GFX7-SDAG-NEXT: v_or_b32_e32 v5, v5, v8 +; GFX7-SDAG-NEXT: v_max_f32_e32 v4, v4, v12 +; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v8 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v16 -; GFX7-SDAG-NEXT: v_max_f32_e32 v7, v7, v15 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v17 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v11, v17 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v12, v19 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX7-SDAG-NEXT: v_or_b32_e32 v6, v6, v8 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GFX7-SDAG-NEXT: v_or_b32_e32 v3, v3, v10 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; GFX7-SDAG-NEXT: v_or_b32_e32 v5, v5, v9 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; GFX7-SDAG-NEXT: v_or_b32_e32 v4, v4, v10 +; GFX7-SDAG-NEXT: v_or_b32_e32 v6, v6, v9 ; GFX7-SDAG-NEXT: v_or_b32_e32 v7, v7, v8 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5453,17 +5504,23 @@ define <32 x half> @v_maximumnum_v32f16(<32 x half> %x, <32 x half> %y) { ; GFX7-SDAG-LABEL: v_maximumnum_v32f16: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: buffer_load_dword v48, off, s[0:3], s32 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v32, 16, v30 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v34, 16, v29 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v35, 16, v12 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v36, 16, v28 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v37, 16, v11 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v38, 16, v27 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v50, 16, v9 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; GFX7-SDAG-NEXT: buffer_load_dword v49, off, s[0:3], s32 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v37, 16, v27 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v39, 16, v26 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v48, 16, v10 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v50, 16, v25 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v52, 16, v24 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v54, 16, v23 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v55, 16, v7 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v32, v32 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v33, v33 @@ -5472,189 +5529,185 @@ define <32 x half> @v_maximumnum_v32f16(<32 x half> %x, <32 x half> %y) { ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v36, v36 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v37, v37 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v38, v38 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v39, v39 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v48, v48 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v50, v50 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v39, 16, v10 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v49, 16, v26 -; GFX7-SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX7-SDAG-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX7-SDAG-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX7-SDAG-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v53, 16, v24 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v54, 16, v7 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v55, 16, v23 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v40, 16, v6 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v41, 16, v22 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GFX7-SDAG-NEXT: v_max_f32_e32 v31, v31, v32 -; GFX7-SDAG-NEXT: v_max_f32_e32 v32, v33, v34 -; GFX7-SDAG-NEXT: v_max_f32_e32 v33, v35, v36 -; GFX7-SDAG-NEXT: v_max_f32_e32 v35, v37, v38 -; GFX7-SDAG-NEXT: v_max_f32_e32 v37, v50, v51 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v50, 16, v4 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v51, 16, v20 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v52, v52 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v53, v53 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v54, v54 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v41, v41 +; GFX7-SDAG-NEXT: v_max_f32_e32 v31, v32, v31 +; GFX7-SDAG-NEXT: v_max_f32_e32 v33, v34, v33 +; GFX7-SDAG-NEXT: v_max_f32_e32 v32, v36, v35 +; GFX7-SDAG-NEXT: v_max_f32_e32 v34, v38, v37 +; GFX7-SDAG-NEXT: v_max_f32_e32 v36, v48, v39 +; GFX7-SDAG-NEXT: v_max_f32_e32 v37, v51, v50 +; GFX7-SDAG-NEXT: v_max_f32_e32 v38, v53, v52 +; GFX7-SDAG-NEXT: v_max_f32_e32 v39, v55, v54 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v50, 16, v20 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v55, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v50, v50 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v42, 16, v15 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v43, 16, v5 -; GFX7-SDAG-NEXT: v_max_f32_e32 v36, v39, v49 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v49, 16, v21 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v42, v42 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GFX7-SDAG-NEXT: v_max_f32_e32 v38, v52, v53 -; GFX7-SDAG-NEXT: v_max_f32_e32 v39, v54, v55 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GFX7-SDAG-NEXT: v_max_f32_e32 v52, v40, v41 -; GFX7-SDAG-NEXT: v_max_f32_e32 v50, v50, v51 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v51, 16, v3 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v53, 16, v19 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v54, 16, v2 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v55, 16, v18 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v40, 16, v1 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v41, 16, v17 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v51, v51 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v52, v52 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v53, v53 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v54, v54 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v55, v55 +; GFX7-SDAG-NEXT: v_max_f32_e32 v50, v51, v50 +; GFX7-SDAG-NEXT: v_max_f32_e32 v51, v53, v52 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v53, 16, v17 +; GFX7-SDAG-NEXT: v_max_f32_e32 v52, v55, v54 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v53, v53 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v54, v54 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX7-SDAG-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX7-SDAG-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX7-SDAG-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX7-SDAG-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX7-SDAG-NEXT: v_max_f32_e32 v53, v54, v53 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v40, 16, v22 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v17 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v53 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v40, v40 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GFX7-SDAG-NEXT: v_max_f32_e32 v49, v43, v49 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v43, 16, v16 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v43, 16, v21 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v44, 16, v5 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v42, 16, v15 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GFX7-SDAG-NEXT: v_max_f32_e32 v51, v51, v53 -; GFX7-SDAG-NEXT: v_max_f32_e32 v53, v54, v55 -; GFX7-SDAG-NEXT: v_max_f32_e32 v54, v40, v41 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v40, v14 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(4) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v34, 16, v48 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GFX7-SDAG-NEXT: v_max_f32_e32 v34, v42, v34 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v42, 16, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v44, v44 +; GFX7-SDAG-NEXT: v_max_f32_e32 v48, v41, v40 +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(5) +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v35, 16, v49 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v55, 16, v16 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v41, 16, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v3, v3, v19 +; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v17 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v51 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v42, v42 -; GFX7-SDAG-NEXT: v_max_f32_e32 v14, v15, v48 -; GFX7-SDAG-NEXT: v_max_f32_e32 v15, v40, v30 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v35, v35 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v55, v55 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v41, v41 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-SDAG-NEXT: v_max_f32_e32 v40, v44, v43 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX7-SDAG-NEXT: v_max_f32_e32 v35, v42, v35 +; GFX7-SDAG-NEXT: v_max_f32_e32 v54, v41, v55 +; GFX7-SDAG-NEXT: v_or_b32_e32 v3, v3, v17 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v40 +; GFX7-SDAG-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX7-SDAG-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX7-SDAG-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX7-SDAG-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX7-SDAG-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_max_f32_e32 v55, v42, v43 -; GFX7-SDAG-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX7-SDAG-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX7-SDAG-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX7-SDAG-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v17 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v21, v21 +; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v2, v18 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v18, v54 ; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v16 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v55 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v54 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v18 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v20, v20 ; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v16 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v2, v18 -; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v16 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v53 -; GFX7-SDAG-NEXT: v_max_f32_e32 v3, v3, v19 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v51 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v52 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v21, v21 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-SDAG-NEXT: v_max_f32_e32 v5, v5, v21 ; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX7-SDAG-NEXT: v_or_b32_e32 v2, v2, v16 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v17 ; GFX7-SDAG-NEXT: v_max_f32_e32 v4, v4, v20 -; GFX7-SDAG-NEXT: v_or_b32_e32 v3, v3, v16 +; GFX7-SDAG-NEXT: v_or_b32_e32 v2, v2, v16 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v50 -; GFX7-SDAG-NEXT: v_max_f32_e32 v5, v5, v21 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v49 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v23, v23 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v22, v22 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v23, v23 ; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX7-SDAG-NEXT: v_or_b32_e32 v4, v4, v16 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; GFX7-SDAG-NEXT: v_max_f32_e32 v6, v6, v22 -; GFX7-SDAG-NEXT: v_or_b32_e32 v5, v5, v16 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v52 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; GFX7-SDAG-NEXT: v_max_f32_e32 v7, v7, v23 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-SDAG-NEXT: v_max_f32_e32 v6, v6, v22 +; GFX7-SDAG-NEXT: v_or_b32_e32 v4, v4, v16 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v48 +; GFX7-SDAG-NEXT: v_or_b32_e32 v5, v5, v17 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v39 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v25, v25 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v24, v24 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v25, v25 ; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX7-SDAG-NEXT: v_or_b32_e32 v6, v6, v16 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX7-SDAG-NEXT: v_max_f32_e32 v9, v9, v25 ; GFX7-SDAG-NEXT: v_max_f32_e32 v8, v8, v24 -; GFX7-SDAG-NEXT: v_or_b32_e32 v7, v7, v16 +; GFX7-SDAG-NEXT: v_or_b32_e32 v6, v6, v16 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v38 -; GFX7-SDAG-NEXT: v_max_f32_e32 v9, v9, v25 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX7-SDAG-NEXT: v_or_b32_e32 v7, v7, v17 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v37 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v27, v27 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v26, v26 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v8 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v27, v27 ; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX7-SDAG-NEXT: v_or_b32_e32 v8, v8, v16 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX7-SDAG-NEXT: v_max_f32_e32 v11, v11, v27 ; GFX7-SDAG-NEXT: v_max_f32_e32 v10, v10, v26 -; GFX7-SDAG-NEXT: v_or_b32_e32 v9, v9, v16 +; GFX7-SDAG-NEXT: v_or_b32_e32 v8, v8, v16 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v36 -; GFX7-SDAG-NEXT: v_max_f32_e32 v11, v11, v27 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v35 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GFX7-SDAG-NEXT: v_or_b32_e32 v9, v9, v17 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v34 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v49, v49 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v30, v30 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v29, v29 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v28, v28 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v10, v10 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v29, v29 ; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX7-SDAG-NEXT: v_or_b32_e32 v10, v10, v16 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; GFX7-SDAG-NEXT: v_max_f32_e32 v12, v12, v28 -; GFX7-SDAG-NEXT: v_or_b32_e32 v11, v11, v16 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v33 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX7-SDAG-NEXT: v_max_f32_e32 v15, v15, v49 +; GFX7-SDAG-NEXT: v_max_f32_e32 v14, v14, v30 ; GFX7-SDAG-NEXT: v_max_f32_e32 v13, v13, v29 +; GFX7-SDAG-NEXT: v_max_f32_e32 v12, v12, v28 +; GFX7-SDAG-NEXT: v_or_b32_e32 v10, v10, v16 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v35 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v18, v33 +; GFX7-SDAG-NEXT: v_or_b32_e32 v11, v11, v17 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v31 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v19, v32 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v15, v15 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v32 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v13, v13 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v14, v14 ; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX7-SDAG-NEXT: v_or_b32_e32 v12, v12, v16 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; GFX7-SDAG-NEXT: v_or_b32_e32 v13, v13, v16 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v31 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v34 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v18, v14 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v14, 16, v16 -; GFX7-SDAG-NEXT: v_or_b32_e32 v14, v15, v14 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; GFX7-SDAG-NEXT: v_or_b32_e32 v15, v18, v15 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX7-SDAG-NEXT: v_or_b32_e32 v12, v12, v19 +; GFX7-SDAG-NEXT: v_or_b32_e32 v13, v13, v18 +; GFX7-SDAG-NEXT: v_or_b32_e32 v14, v14, v17 +; GFX7-SDAG-NEXT: v_or_b32_e32 v15, v15, v16 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -8536,13 +8589,13 @@ define <2 x half> @v_maximumnum_v2f16_no_ieee(<2 x half> %x, <2 x half> %y) #0 { ; GFX7-SDAG-LABEL: v_maximumnum_v2f16_no_ieee: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v3, v2 +; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v2, v3 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -8634,13 +8687,13 @@ define <2 x half> @v_maximumnum_v2f16_nnan_no_ieee(<2 x half> %x, <2 x half> %y) ; GFX7-SDAG-LABEL: v_maximumnum_v2f16_nnan_no_ieee: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v3, v2 +; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v2, v3 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -8716,21 +8769,21 @@ define <3 x half> @v_maximumnum_v3f16_nnan_no_ieee(<3 x half> %x, <3 x half> %y) ; GFX7-SDAG-LABEL: v_maximumnum_v3f16_nnan_no_ieee: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX7-SDAG-NEXT: v_max_f32_e32 v4, v4, v5 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v4 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -8819,29 +8872,29 @@ define <4 x half> @v_maximumnum_v4f16_nnan_no_ieee(<4 x half> %x, <4 x half> %y) ; GFX7-SDAG-LABEL: v_maximumnum_v4f16_nnan_no_ieee: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v4, v4, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v7 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_max_f32_e32 v4, v5, v4 -; GFX7-SDAG-NEXT: v_max_f32_e32 v6, v7, v6 -; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v6 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.ll index 0311caf93a14e..8c98931b02933 100644 --- a/llvm/test/CodeGen/AMDGPU/minimumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/minimumnum.ll @@ -27,14 +27,23 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16,GFX12-GISEL,GFX12-FAKE16-GISEL %s define half @v_minimumnum_f16(half %x, half %y) { -; GFX7-LABEL: v_minimumnum_f16: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: v_minimumnum_f16: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: v_minimumnum_f16: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_minimumnum_f16: ; GFX8-SDAG: ; %bb.0: @@ -905,14 +914,23 @@ define double @v_minimumnum_f64_1.0(double %x) { } define half @v_minimumnum_f16_v_s(half %x, half inreg %y) { -; GFX7-LABEL: v_minimumnum_f16_v_s: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, s16 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: v_minimumnum_f16_v_s: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, s16 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: v_minimumnum_f16_v_s: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, s16 +; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_minimumnum_f16_v_s: ; GFX8-SDAG: ; %bb.0: @@ -1070,14 +1088,23 @@ define half @v_minimumnum_f16_v_s(half %x, half inreg %y) { } define half @v_minimumnum_f16_s_s(half inreg %x, half inreg %y) { -; GFX7-LABEL: v_minimumnum_f16_s_s: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, s16 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, s17 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: v_minimumnum_f16_s_s: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, s17 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, s16 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v1, v0 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: v_minimumnum_f16_s_s: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, s16 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, s17 +; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_minimumnum_f16_s_s: ; GFX8-SDAG: ; %bb.0: @@ -2398,14 +2425,23 @@ define float @v_minimumnum_f32_fneg(float %x, float %y) { } define half @v_minimumnum_f16_fabs_rhs(half %x, half %y) { -; GFX7-LABEL: v_minimumnum_f16_fabs_rhs: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: v_minimumnum_f16_fabs_rhs: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1| +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: v_minimumnum_f16_fabs_rhs: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v1, |v1| +; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_minimumnum_f16_fabs_rhs: ; GFX8-SDAG: ; %bb.0: @@ -2551,9 +2587,8 @@ define half @v_minimumnum_f16_fneg_fabs_rhs(half %x, half %y) { ; GFX7-SDAG-LABEL: v_minimumnum_f16_fneg_fabs_rhs: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, -|v1| +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -2709,14 +2744,23 @@ define half @v_minimumnum_f16_fneg_fabs_rhs(half %x, half %y) { } define half @v_minimumnum_f16_fabs(half %x, half %y) { -; GFX7-LABEL: v_minimumnum_f16_fabs: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; GFX7-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: v_minimumnum_f16_fabs: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1| +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: v_minimumnum_f16_fabs: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v1, |v1| +; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_minimumnum_f16_fabs: ; GFX8-SDAG: ; %bb.0: @@ -2863,10 +2907,8 @@ define half @v_minimumnum_f16_fneg(half %x, half %y) { ; GFX7-SDAG-LABEL: v_minimumnum_f16_fneg: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, -v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -3141,10 +3183,10 @@ define <2 x half> @v_minimumnum_v2f16(<2 x half> %x, <2 x half> %y) { ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v2, v3 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 @@ -3295,13 +3337,13 @@ define <2 x half> @v_minimumnum_v2f16_nnan(<2 x half> %x, <2 x half> %y) { ; GFX7-SDAG-LABEL: v_minimumnum_v2f16_nnan: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v3, v2 +; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v2, v3 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -3379,19 +3421,19 @@ define <3 x half> @v_minimumnum_v3f16(<3 x half> %x, <3 x half> %y) { ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_min_f32_e32 v4, v4, v5 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v4 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -3565,21 +3607,21 @@ define <3 x half> @v_minimumnum_v3f16_nnan(<3 x half> %x, <3 x half> %y) { ; GFX7-SDAG-LABEL: v_minimumnum_v3f16_nnan: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_min_f32_e32 v4, v5, v4 +; GFX7-SDAG-NEXT: v_min_f32_e32 v4, v4, v5 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v4 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -3668,29 +3710,29 @@ define <4 x half> @v_minimumnum_v4f16(<4 x half> %x, <4 x half> %y) { ; GFX7-SDAG-LABEL: v_minimumnum_v4f16: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_min_f32_e32 v4, v4, v5 -; GFX7-SDAG-NEXT: v_min_f32_e32 v6, v6, v7 -; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v5, v6, v5 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v6 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -3892,29 +3934,29 @@ define <4 x half> @v_minimumnum_v4f16_nnan(<4 x half> %x, <4 x half> %y) { ; GFX7-SDAG-LABEL: v_minimumnum_v4f16_nnan: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v4, v4, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v7 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_min_f32_e32 v4, v5, v4 -; GFX7-SDAG-NEXT: v_min_f32_e32 v6, v7, v6 -; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v5, v6, v5 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v6 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -4010,39 +4052,39 @@ define <6 x half> @v_minimumnum_v6f16(<6 x half> %x, <6 x half> %y) { ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v1 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; GFX7-SDAG-NEXT: v_min_f32_e32 v6, v6, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v2 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_min_f32_e32 v6, v6, v7 ; GFX7-SDAG-NEXT: v_min_f32_e32 v8, v8, v9 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-SDAG-NEXT: v_min_f32_e32 v10, v10, v11 -; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v4 -; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v8 +; GFX7-SDAG-NEXT: v_min_f32_e32 v9, v10, v11 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v9 ; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v2, v5 +; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v10 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v9 +; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v8 ; GFX7-SDAG-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -4300,54 +4342,54 @@ define <8 x half> @v_minimumnum_v8f16(<8 x half> %x, <8 x half> %y) { ; GFX7-SDAG-LABEL: v_minimumnum_v8f16: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v12, 16, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v0 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v15, 16, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-SDAG-NEXT: v_min_f32_e32 v11, v11, v12 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v15 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_min_f32_e32 v8, v8, v9 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; GFX7-SDAG-NEXT: v_min_f32_e32 v10, v10, v11 +; GFX7-SDAG-NEXT: v_min_f32_e32 v12, v14, v12 +; GFX7-SDAG-NEXT: v_min_f32_e32 v10, v10, v13 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v8 ; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v8 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v12, v12 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GFX7-SDAG-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v2, v6 ; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v5 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v10 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-SDAG-NEXT: v_min_f32_e32 v12, v12, v13 -; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX7-SDAG-NEXT: v_min_f32_e32 v14, v14, v15 -; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v12 -; GFX7-SDAG-NEXT: v_min_f32_e32 v3, v3, v7 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v14 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v12 +; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v10 ; GFX7-SDAG-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX7-SDAG-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX7-SDAG-NEXT: v_or_b32_e32 v3, v3, v11 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_minimumnum_v8f16: @@ -4660,101 +4702,101 @@ define <16 x half> @v_minimumnum_v16f16(<16 x half> %x, <16 x half> %y) { ; GFX7-SDAG-LABEL: v_minimumnum_v16f16: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v16, 16, v15 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v18, 16, v15 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v20, 16, v5 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v21, 16, v13 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v22, 16, v4 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v18, 16, v14 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v20, 16, v13 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v22, 16, v12 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v17, v17 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v19, v19 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v20, v20 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v21, v21 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v22, v22 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GFX7-SDAG-NEXT: v_min_f32_e32 v17, v17, v18 -; GFX7-SDAG-NEXT: v_min_f32_e32 v16, v16, v19 -; GFX7-SDAG-NEXT: v_min_f32_e32 v18, v20, v21 -; GFX7-SDAG-NEXT: v_min_f32_e32 v19, v22, v23 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v20, 16, v3 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v21, 16, v11 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; GFX7-SDAG-NEXT: v_min_f32_e32 v16, v17, v16 +; GFX7-SDAG-NEXT: v_min_f32_e32 v17, v19, v18 +; GFX7-SDAG-NEXT: v_min_f32_e32 v18, v21, v20 +; GFX7-SDAG-NEXT: v_min_f32_e32 v19, v23, v22 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v20, 16, v11 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v22, 16, v10 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v23, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v20, v20 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v21, v21 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v22, v22 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; GFX7-SDAG-NEXT: v_min_f32_e32 v20, v20, v21 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v20, v21, v20 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GFX7-SDAG-NEXT: v_min_f32_e32 v21, v22, v23 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v22, 16, v1 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; GFX7-SDAG-NEXT: v_min_f32_e32 v21, v23, v22 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v22, 16, v9 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v23, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v22, v22 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX7-SDAG-NEXT: v_min_f32_e32 v22, v22, v23 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_min_f32_e32 v22, v23, v22 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v23, 16, v8 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v9 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v22 -; GFX7-SDAG-NEXT: v_min_f32_e32 v23, v23, v24 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_min_f32_e32 v23, v24, v23 ; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v8 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v23 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v9 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v22 ; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v2, v10 ; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v8 ; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v2, v10 -; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v8 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v21 -; GFX7-SDAG-NEXT: v_min_f32_e32 v3, v3, v11 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v21 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v20 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX7-SDAG-NEXT: v_or_b32_e32 v2, v2, v8 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GFX7-SDAG-NEXT: v_min_f32_e32 v4, v4, v12 -; GFX7-SDAG-NEXT: v_or_b32_e32 v3, v3, v8 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v19 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; GFX7-SDAG-NEXT: v_min_f32_e32 v5, v5, v13 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-SDAG-NEXT: v_min_f32_e32 v3, v3, v11 +; GFX7-SDAG-NEXT: v_or_b32_e32 v2, v2, v9 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v18 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX7-SDAG-NEXT: v_or_b32_e32 v4, v4, v8 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v10, v20 +; GFX7-SDAG-NEXT: v_min_f32_e32 v7, v7, v15 ; GFX7-SDAG-NEXT: v_min_f32_e32 v6, v6, v14 -; GFX7-SDAG-NEXT: v_or_b32_e32 v5, v5, v8 +; GFX7-SDAG-NEXT: v_min_f32_e32 v4, v4, v12 +; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v8 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v16 -; GFX7-SDAG-NEXT: v_min_f32_e32 v7, v7, v15 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v17 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v11, v17 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v12, v19 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX7-SDAG-NEXT: v_or_b32_e32 v6, v6, v8 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GFX7-SDAG-NEXT: v_or_b32_e32 v3, v3, v10 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; GFX7-SDAG-NEXT: v_or_b32_e32 v5, v5, v9 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; GFX7-SDAG-NEXT: v_or_b32_e32 v4, v4, v10 +; GFX7-SDAG-NEXT: v_or_b32_e32 v6, v6, v9 ; GFX7-SDAG-NEXT: v_or_b32_e32 v7, v7, v8 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5288,17 +5330,23 @@ define <32 x half> @v_minimumnum_v32f16(<32 x half> %x, <32 x half> %y) { ; GFX7-SDAG-LABEL: v_minimumnum_v32f16: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: buffer_load_dword v48, off, s[0:3], s32 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v32, 16, v30 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v34, 16, v29 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v35, 16, v12 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v36, 16, v28 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v37, 16, v11 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v38, 16, v27 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v50, 16, v9 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; GFX7-SDAG-NEXT: buffer_load_dword v49, off, s[0:3], s32 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v37, 16, v27 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v39, 16, v26 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v48, 16, v10 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v50, 16, v25 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v52, 16, v24 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v54, 16, v23 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v55, 16, v7 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v32, v32 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v33, v33 @@ -5307,189 +5355,185 @@ define <32 x half> @v_minimumnum_v32f16(<32 x half> %x, <32 x half> %y) { ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v36, v36 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v37, v37 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v38, v38 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v39, v39 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v48, v48 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v50, v50 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v39, 16, v10 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v49, 16, v26 -; GFX7-SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX7-SDAG-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX7-SDAG-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX7-SDAG-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v53, 16, v24 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v54, 16, v7 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v55, 16, v23 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v40, 16, v6 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v41, 16, v22 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GFX7-SDAG-NEXT: v_min_f32_e32 v31, v31, v32 -; GFX7-SDAG-NEXT: v_min_f32_e32 v32, v33, v34 -; GFX7-SDAG-NEXT: v_min_f32_e32 v33, v35, v36 -; GFX7-SDAG-NEXT: v_min_f32_e32 v35, v37, v38 -; GFX7-SDAG-NEXT: v_min_f32_e32 v37, v50, v51 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v50, 16, v4 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v51, 16, v20 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v52, v52 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v53, v53 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v54, v54 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v41, v41 +; GFX7-SDAG-NEXT: v_min_f32_e32 v31, v32, v31 +; GFX7-SDAG-NEXT: v_min_f32_e32 v33, v34, v33 +; GFX7-SDAG-NEXT: v_min_f32_e32 v32, v36, v35 +; GFX7-SDAG-NEXT: v_min_f32_e32 v34, v38, v37 +; GFX7-SDAG-NEXT: v_min_f32_e32 v36, v48, v39 +; GFX7-SDAG-NEXT: v_min_f32_e32 v37, v51, v50 +; GFX7-SDAG-NEXT: v_min_f32_e32 v38, v53, v52 +; GFX7-SDAG-NEXT: v_min_f32_e32 v39, v55, v54 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v50, 16, v20 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v55, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v50, v50 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v42, 16, v15 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v43, 16, v5 -; GFX7-SDAG-NEXT: v_min_f32_e32 v36, v39, v49 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v49, 16, v21 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v42, v42 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GFX7-SDAG-NEXT: v_min_f32_e32 v38, v52, v53 -; GFX7-SDAG-NEXT: v_min_f32_e32 v39, v54, v55 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GFX7-SDAG-NEXT: v_min_f32_e32 v52, v40, v41 -; GFX7-SDAG-NEXT: v_min_f32_e32 v50, v50, v51 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v51, 16, v3 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v53, 16, v19 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v54, 16, v2 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v55, 16, v18 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v40, 16, v1 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v41, 16, v17 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v51, v51 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v52, v52 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v53, v53 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v54, v54 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v55, v55 +; GFX7-SDAG-NEXT: v_min_f32_e32 v50, v51, v50 +; GFX7-SDAG-NEXT: v_min_f32_e32 v51, v53, v52 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v53, 16, v17 +; GFX7-SDAG-NEXT: v_min_f32_e32 v52, v55, v54 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v53, v53 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v54, v54 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX7-SDAG-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX7-SDAG-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX7-SDAG-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX7-SDAG-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX7-SDAG-NEXT: v_min_f32_e32 v53, v54, v53 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v40, 16, v22 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v17 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v53 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v40, v40 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GFX7-SDAG-NEXT: v_min_f32_e32 v49, v43, v49 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v43, 16, v16 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v43, 16, v21 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v44, 16, v5 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v42, 16, v15 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GFX7-SDAG-NEXT: v_min_f32_e32 v51, v51, v53 -; GFX7-SDAG-NEXT: v_min_f32_e32 v53, v54, v55 -; GFX7-SDAG-NEXT: v_min_f32_e32 v54, v40, v41 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v40, v14 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(4) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v34, 16, v48 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GFX7-SDAG-NEXT: v_min_f32_e32 v34, v42, v34 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v42, 16, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v44, v44 +; GFX7-SDAG-NEXT: v_min_f32_e32 v48, v41, v40 +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(5) +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v35, 16, v49 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v55, 16, v16 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v41, 16, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v3, v3, v19 +; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v17 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v51 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v42, v42 -; GFX7-SDAG-NEXT: v_min_f32_e32 v14, v15, v48 -; GFX7-SDAG-NEXT: v_min_f32_e32 v15, v40, v30 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v35, v35 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v55, v55 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v41, v41 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-SDAG-NEXT: v_min_f32_e32 v40, v44, v43 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX7-SDAG-NEXT: v_min_f32_e32 v35, v42, v35 +; GFX7-SDAG-NEXT: v_min_f32_e32 v54, v41, v55 +; GFX7-SDAG-NEXT: v_or_b32_e32 v3, v3, v17 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v40 +; GFX7-SDAG-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX7-SDAG-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX7-SDAG-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX7-SDAG-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX7-SDAG-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_min_f32_e32 v55, v42, v43 -; GFX7-SDAG-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX7-SDAG-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX7-SDAG-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX7-SDAG-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v17 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v21, v21 +; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v2, v18 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v18, v54 ; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v16 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v55 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v54 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v18 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v20, v20 ; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v16 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v2, v18 -; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v16 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v53 -; GFX7-SDAG-NEXT: v_min_f32_e32 v3, v3, v19 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v51 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v52 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v21, v21 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-SDAG-NEXT: v_min_f32_e32 v5, v5, v21 ; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX7-SDAG-NEXT: v_or_b32_e32 v2, v2, v16 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v17 ; GFX7-SDAG-NEXT: v_min_f32_e32 v4, v4, v20 -; GFX7-SDAG-NEXT: v_or_b32_e32 v3, v3, v16 +; GFX7-SDAG-NEXT: v_or_b32_e32 v2, v2, v16 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v50 -; GFX7-SDAG-NEXT: v_min_f32_e32 v5, v5, v21 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v49 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v23, v23 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v22, v22 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v23, v23 ; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX7-SDAG-NEXT: v_or_b32_e32 v4, v4, v16 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; GFX7-SDAG-NEXT: v_min_f32_e32 v6, v6, v22 -; GFX7-SDAG-NEXT: v_or_b32_e32 v5, v5, v16 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v52 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; GFX7-SDAG-NEXT: v_min_f32_e32 v7, v7, v23 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-SDAG-NEXT: v_min_f32_e32 v6, v6, v22 +; GFX7-SDAG-NEXT: v_or_b32_e32 v4, v4, v16 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v48 +; GFX7-SDAG-NEXT: v_or_b32_e32 v5, v5, v17 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v39 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v25, v25 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v24, v24 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v25, v25 ; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX7-SDAG-NEXT: v_or_b32_e32 v6, v6, v16 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX7-SDAG-NEXT: v_min_f32_e32 v9, v9, v25 ; GFX7-SDAG-NEXT: v_min_f32_e32 v8, v8, v24 -; GFX7-SDAG-NEXT: v_or_b32_e32 v7, v7, v16 +; GFX7-SDAG-NEXT: v_or_b32_e32 v6, v6, v16 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v38 -; GFX7-SDAG-NEXT: v_min_f32_e32 v9, v9, v25 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX7-SDAG-NEXT: v_or_b32_e32 v7, v7, v17 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v37 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v27, v27 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v26, v26 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v8 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v27, v27 ; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX7-SDAG-NEXT: v_or_b32_e32 v8, v8, v16 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX7-SDAG-NEXT: v_min_f32_e32 v11, v11, v27 ; GFX7-SDAG-NEXT: v_min_f32_e32 v10, v10, v26 -; GFX7-SDAG-NEXT: v_or_b32_e32 v9, v9, v16 +; GFX7-SDAG-NEXT: v_or_b32_e32 v8, v8, v16 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v36 -; GFX7-SDAG-NEXT: v_min_f32_e32 v11, v11, v27 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v35 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GFX7-SDAG-NEXT: v_or_b32_e32 v9, v9, v17 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v34 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v49, v49 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v30, v30 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v29, v29 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v28, v28 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v10, v10 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v29, v29 ; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX7-SDAG-NEXT: v_or_b32_e32 v10, v10, v16 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; GFX7-SDAG-NEXT: v_min_f32_e32 v12, v12, v28 -; GFX7-SDAG-NEXT: v_or_b32_e32 v11, v11, v16 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v33 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX7-SDAG-NEXT: v_min_f32_e32 v15, v15, v49 +; GFX7-SDAG-NEXT: v_min_f32_e32 v14, v14, v30 ; GFX7-SDAG-NEXT: v_min_f32_e32 v13, v13, v29 +; GFX7-SDAG-NEXT: v_min_f32_e32 v12, v12, v28 +; GFX7-SDAG-NEXT: v_or_b32_e32 v10, v10, v16 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v35 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v18, v33 +; GFX7-SDAG-NEXT: v_or_b32_e32 v11, v11, v17 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v31 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v19, v32 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v15, v15 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v32 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v13, v13 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v14, v14 ; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX7-SDAG-NEXT: v_or_b32_e32 v12, v12, v16 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; GFX7-SDAG-NEXT: v_or_b32_e32 v13, v13, v16 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v31 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v34 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v18, v14 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v14, 16, v16 -; GFX7-SDAG-NEXT: v_or_b32_e32 v14, v15, v14 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; GFX7-SDAG-NEXT: v_or_b32_e32 v15, v18, v15 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX7-SDAG-NEXT: v_or_b32_e32 v12, v12, v19 +; GFX7-SDAG-NEXT: v_or_b32_e32 v13, v13, v18 +; GFX7-SDAG-NEXT: v_or_b32_e32 v14, v14, v17 +; GFX7-SDAG-NEXT: v_or_b32_e32 v15, v15, v16 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -8371,13 +8415,13 @@ define <2 x half> @v_minimumnum_v2f16_no_ieee(<2 x half> %x, <2 x half> %y) #0 { ; GFX7-SDAG-LABEL: v_minimumnum_v2f16_no_ieee: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v3, v2 +; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v2, v3 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -8469,13 +8513,13 @@ define <2 x half> @v_minimumnum_v2f16_nnan_no_ieee(<2 x half> %x, <2 x half> %y) ; GFX7-SDAG-LABEL: v_minimumnum_v2f16_nnan_no_ieee: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v3, v2 +; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v2, v3 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -8551,21 +8595,21 @@ define <3 x half> @v_minimumnum_v3f16_nnan_no_ieee(<3 x half> %x, <3 x half> %y) ; GFX7-SDAG-LABEL: v_minimumnum_v3f16_nnan_no_ieee: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_min_f32_e32 v4, v5, v4 +; GFX7-SDAG-NEXT: v_min_f32_e32 v4, v4, v5 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v4 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -8654,29 +8698,29 @@ define <4 x half> @v_minimumnum_v4f16_nnan_no_ieee(<4 x half> %x, <4 x half> %y) ; GFX7-SDAG-LABEL: v_minimumnum_v4f16_nnan_no_ieee: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v4, v4, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v7 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_min_f32_e32 v4, v5, v4 -; GFX7-SDAG-NEXT: v_min_f32_e32 v6, v7, v6 -; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v5, v6, v5 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v6 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/omod.ll b/llvm/test/CodeGen/AMDGPU/omod.ll index 90632c663bf4a..7ed68dd6a00fe 100644 --- a/llvm/test/CodeGen/AMDGPU/omod.ll +++ b/llvm/test/CodeGen/AMDGPU/omod.ll @@ -1086,7 +1086,9 @@ define amdgpu_ps void @v_omod_div2_f16_denormals(half %a) #0 { ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_add_f32_e64 v0, v0, 1.0 div:2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, v0 div:2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm @@ -1142,7 +1144,9 @@ define amdgpu_ps void @v_omod_mul2_f16_denormals(half %a) #0 { ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_add_f32_e64 v0, v0, 1.0 mul:2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, v0 mul:2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm @@ -1197,7 +1201,9 @@ define amdgpu_ps void @v_omod_div2_f16_no_denormals(half %a) #3 { ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_add_f32_e64 v0, v0, 1.0 div:2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, v0 div:2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll b/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll index ff3a735bd32b4..2d3524d711788 100644 --- a/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll +++ b/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll @@ -251,15 +251,17 @@ define <2 x half> @v_repeat_divisor_f16_x2_arcp(half %x, half %y, half %D) #0 { ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, 1.0 ; GFX6-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-NEXT: v_fma_f32 v5, -v3, v4, 1.0 -; GFX6-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX6-NEXT: v_div_scale_f32 v5, vcc, 1.0, v2, 1.0 +; GFX6-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-NEXT: v_fma_f32 v4, v6, v4, v4 ; GFX6-NEXT: v_mul_f32_e32 v6, v5, v4 ; GFX6-NEXT: v_fma_f32 v7, -v3, v6, v5 ; GFX6-NEXT: v_fma_f32 v6, v7, v4, v6 ; GFX6-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-NEXT: v_div_fmas_f32 v3, v3, v4, v6 ; GFX6-NEXT: v_div_fixup_f32 v2, v3, v2, 1.0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v1, v1, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -533,15 +535,17 @@ define <3 x half> @v_repeat_divisor_f16_x3_arcp(half %x, half %y, half %z, half ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, 1.0 ; GFX6-NEXT: v_rcp_f32_e32 v5, v4 -; GFX6-NEXT: v_fma_f32 v6, -v4, v5, 1.0 -; GFX6-NEXT: v_fma_f32 v5, v6, v5, v5 ; GFX6-NEXT: v_div_scale_f32 v6, vcc, 1.0, v3, 1.0 +; GFX6-NEXT: v_fma_f32 v7, -v4, v5, 1.0 +; GFX6-NEXT: v_fma_f32 v5, v7, v5, v5 ; GFX6-NEXT: v_mul_f32_e32 v7, v6, v5 ; GFX6-NEXT: v_fma_f32 v8, -v4, v7, v6 ; GFX6-NEXT: v_fma_f32 v7, v8, v5, v7 ; GFX6-NEXT: v_fma_f32 v4, -v4, v7, v6 ; GFX6-NEXT: v_div_fmas_f32 v4, v4, v5, v7 ; GFX6-NEXT: v_div_fixup_f32 v3, v4, v3, 1.0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX6-NEXT: v_mul_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v1 @@ -784,41 +788,45 @@ define <4 x half> @v_repeat_divisor_v2f16_x2(<2 x half> %x, <2 x half> %y, <2 x ; GFX6-LABEL: v_repeat_divisor_v2f16_x2: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX6-NEXT: v_div_scale_f32 v6, s[4:5], v4, v4, 1.0 -; GFX6-NEXT: v_rcp_f32_e32 v7, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX6-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, 1.0 +; GFX6-NEXT: v_rcp_f32_e32 v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_fma_f32 v8, -v6, v7, 1.0 -; GFX6-NEXT: v_fma_f32 v7, v8, v7, v7 -; GFX6-NEXT: v_div_scale_f32 v8, vcc, 1.0, v4, 1.0 -; GFX6-NEXT: v_mul_f32_e32 v9, v8, v7 -; GFX6-NEXT: v_fma_f32 v10, -v6, v9, v8 -; GFX6-NEXT: v_fma_f32 v9, v10, v7, v9 -; GFX6-NEXT: v_fma_f32 v6, -v6, v9, v8 -; GFX6-NEXT: v_div_fmas_f32 v6, v6, v7, v9 +; GFX6-NEXT: v_fma_f32 v7, -v4, v5, 1.0 +; GFX6-NEXT: v_fma_f32 v5, v7, v5, v5 +; GFX6-NEXT: v_div_scale_f32 v7, vcc, 1.0, v3, 1.0 +; GFX6-NEXT: v_mul_f32_e32 v8, v7, v5 +; GFX6-NEXT: v_fma_f32 v9, -v4, v8, v7 +; GFX6-NEXT: v_fma_f32 v8, v9, v5, v8 +; GFX6-NEXT: v_fma_f32 v4, -v4, v8, v7 ; GFX6-NEXT: v_div_scale_f32 v7, s[4:5], v2, v2, 1.0 -; GFX6-NEXT: v_rcp_f32_e32 v8, v7 -; GFX6-NEXT: v_div_fixup_f32 v4, v6, v4, 1.0 +; GFX6-NEXT: v_rcp_f32_e32 v9, v7 +; GFX6-NEXT: v_div_fmas_f32 v4, v4, v5, v8 +; GFX6-NEXT: v_div_fixup_f32 v3, v4, v3, 1.0 +; GFX6-NEXT: v_div_scale_f32 v5, vcc, 1.0, v2, 1.0 +; GFX6-NEXT: v_fma_f32 v4, -v7, v9, 1.0 +; GFX6-NEXT: v_fma_f32 v4, v4, v9, v9 +; GFX6-NEXT: v_mul_f32_e32 v8, v5, v4 +; GFX6-NEXT: v_fma_f32 v9, -v7, v8, v5 +; GFX6-NEXT: v_fma_f32 v8, v9, v4, v8 +; GFX6-NEXT: v_fma_f32 v5, -v7, v8, v5 +; GFX6-NEXT: v_div_fmas_f32 v4, v5, v4, v8 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_div_fixup_f32 v2, v4, v2, 1.0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_mul_f32_e32 v5, v5, v4 -; GFX6-NEXT: v_fma_f32 v6, -v7, v8, 1.0 -; GFX6-NEXT: v_fma_f32 v6, v6, v8, v8 -; GFX6-NEXT: v_div_scale_f32 v8, vcc, 1.0, v2, 1.0 -; GFX6-NEXT: v_mul_f32_e32 v9, v8, v6 -; GFX6-NEXT: v_fma_f32 v10, -v7, v9, v8 -; GFX6-NEXT: v_fma_f32 v9, v10, v6, v9 -; GFX6-NEXT: v_fma_f32 v7, -v7, v9, v8 -; GFX6-NEXT: v_div_fmas_f32 v6, v7, v6, v9 -; GFX6-NEXT: v_div_fixup_f32 v2, v6, v2, 1.0 +; GFX6-NEXT: v_mul_f32_e32 v5, v6, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX6-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_mul_f32_e32 v1, v1, v2 @@ -871,69 +879,75 @@ define <6 x half> @v_repeat_divisor_v3f16_x2(<3 x half> %x, <3 x half> %y, <3 x ; GFX6-LABEL: v_repeat_divisor_v3f16_x2: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX6-NEXT: v_div_scale_f32 v9, s[4:5], v4, v4, 1.0 -; GFX6-NEXT: v_rcp_f32_e32 v10, v9 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_div_scale_f32 v7, s[4:5], v6, v6, 1.0 +; GFX6-NEXT: v_rcp_f32_e32 v8, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_fma_f32 v9, -v7, v8, 1.0 +; GFX6-NEXT: v_fma_f32 v8, v9, v8, v8 +; GFX6-NEXT: v_div_scale_f32 v9, vcc, 1.0, v6, 1.0 +; GFX6-NEXT: v_mul_f32_e32 v10, v9, v8 +; GFX6-NEXT: v_fma_f32 v11, -v7, v10, v9 +; GFX6-NEXT: v_fma_f32 v10, v11, v8, v10 +; GFX6-NEXT: v_fma_f32 v7, -v7, v10, v9 +; GFX6-NEXT: v_div_fmas_f32 v7, v7, v8, v10 +; GFX6-NEXT: v_div_scale_f32 v8, s[4:5], v4, v4, 1.0 +; GFX6-NEXT: v_rcp_f32_e32 v9, v8 +; GFX6-NEXT: v_div_fixup_f32 v6, v7, v6, 1.0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX6-NEXT: v_fma_f32 v11, -v9, v10, 1.0 -; GFX6-NEXT: v_fma_f32 v10, v11, v10, v10 -; GFX6-NEXT: v_div_scale_f32 v11, vcc, 1.0, v4, 1.0 -; GFX6-NEXT: v_mul_f32_e32 v12, v11, v10 -; GFX6-NEXT: v_fma_f32 v13, -v9, v12, v11 -; GFX6-NEXT: v_fma_f32 v12, v13, v10, v12 -; GFX6-NEXT: v_fma_f32 v9, -v9, v12, v11 -; GFX6-NEXT: v_div_scale_f32 v11, s[4:5], v8, v8, 1.0 -; GFX6-NEXT: v_rcp_f32_e32 v13, v11 -; GFX6-NEXT: v_div_fmas_f32 v9, v9, v10, v12 -; GFX6-NEXT: v_div_fixup_f32 v4, v9, v4, 1.0 -; GFX6-NEXT: v_div_scale_f32 v10, vcc, 1.0, v8, 1.0 -; GFX6-NEXT: v_fma_f32 v9, -v11, v13, 1.0 -; GFX6-NEXT: v_fma_f32 v9, v9, v13, v13 -; GFX6-NEXT: v_mul_f32_e32 v12, v10, v9 -; GFX6-NEXT: v_fma_f32 v13, -v11, v12, v10 -; GFX6-NEXT: v_fma_f32 v12, v13, v9, v12 -; GFX6-NEXT: v_fma_f32 v10, -v11, v12, v10 -; GFX6-NEXT: v_div_scale_f32 v11, s[4:5], v5, v5, 1.0 -; GFX6-NEXT: v_rcp_f32_e32 v13, v11 -; GFX6-NEXT: v_div_fmas_f32 v9, v10, v9, v12 -; GFX6-NEXT: v_div_fixup_f32 v8, v9, v8, 1.0 -; GFX6-NEXT: v_div_scale_f32 v10, vcc, 1.0, v5, 1.0 -; GFX6-NEXT: v_fma_f32 v9, -v11, v13, 1.0 -; GFX6-NEXT: v_fma_f32 v9, v9, v13, v13 -; GFX6-NEXT: v_mul_f32_e32 v12, v10, v9 -; GFX6-NEXT: v_fma_f32 v13, -v11, v12, v10 -; GFX6-NEXT: v_fma_f32 v12, v13, v9, v12 -; GFX6-NEXT: v_fma_f32 v10, -v11, v12, v10 -; GFX6-NEXT: v_div_fmas_f32 v9, v10, v9, v12 -; GFX6-NEXT: v_div_fixup_f32 v5, v9, v5, 1.0 +; GFX6-NEXT: v_fma_f32 v10, -v8, v9, 1.0 +; GFX6-NEXT: v_fma_f32 v9, v10, v9, v9 +; GFX6-NEXT: v_div_scale_f32 v10, vcc, 1.0, v4, 1.0 +; GFX6-NEXT: v_mul_f32_e32 v11, v10, v9 +; GFX6-NEXT: v_fma_f32 v12, -v8, v11, v10 +; GFX6-NEXT: v_fma_f32 v11, v12, v9, v11 +; GFX6-NEXT: v_fma_f32 v8, -v8, v11, v10 +; GFX6-NEXT: v_div_scale_f32 v10, s[4:5], v5, v5, 1.0 +; GFX6-NEXT: v_rcp_f32_e32 v12, v10 +; GFX6-NEXT: v_div_fmas_f32 v8, v8, v9, v11 +; GFX6-NEXT: v_div_fixup_f32 v4, v8, v4, 1.0 +; GFX6-NEXT: v_div_scale_f32 v9, vcc, 1.0, v5, 1.0 +; GFX6-NEXT: v_fma_f32 v8, -v10, v12, 1.0 +; GFX6-NEXT: v_fma_f32 v8, v8, v12, v12 +; GFX6-NEXT: v_mul_f32_e32 v11, v9, v8 +; GFX6-NEXT: v_fma_f32 v12, -v10, v11, v9 +; GFX6-NEXT: v_fma_f32 v11, v12, v8, v11 +; GFX6-NEXT: v_fma_f32 v9, -v10, v11, v9 +; GFX6-NEXT: v_div_fmas_f32 v8, v9, v8, v11 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_div_fixup_f32 v5, v8, v5, 1.0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX6-NEXT: v_mul_f32_e32 v2, v2, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_mul_f32_e32 v7, v7, v6 ; GFX6-NEXT: v_mul_f32_e32 v1, v1, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_mul_f32_e32 v3, v3, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX6-NEXT: v_mul_f32_e32 v0, v0, v4 -; GFX6-NEXT: v_mul_f32_e32 v7, v7, v8 -; GFX6-NEXT: v_mul_f32_e32 v4, v6, v8 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v7 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_mul_f32_e32 v6, v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v7 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_repeat_divisor_v3f16_x2: diff --git a/llvm/test/CodeGen/AMDGPU/roundeven.ll b/llvm/test/CodeGen/AMDGPU/roundeven.ll index 99d494d4feaf4..8920bfbd3b9dc 100644 --- a/llvm/test/CodeGen/AMDGPU/roundeven.ll +++ b/llvm/test/CodeGen/AMDGPU/roundeven.ll @@ -821,46 +821,46 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) { ; SDAG_GFX6-LABEL: v_roundeven_v4f16: ; SDAG_GFX6: ; %bb.0: ; SDAG_GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG_GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SDAG_GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SDAG_GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SDAG_GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; SDAG_GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG_GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SDAG_GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG_GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG_GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG_GFX6-NEXT: v_rndne_f32_e32 v2, v2 ; SDAG_GFX6-NEXT: v_rndne_f32_e32 v3, v3 -; SDAG_GFX6-NEXT: v_rndne_f32_e32 v0, v0 ; SDAG_GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SDAG_GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SDAG_GFX6-NEXT: v_rndne_f32_e32 v1, v1 +; SDAG_GFX6-NEXT: v_rndne_f32_e32 v0, v0 ; SDAG_GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SDAG_GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SDAG_GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SDAG_GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SDAG_GFX6-NEXT: v_or_b32_e32 v0, v0, v2 -; SDAG_GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SDAG_GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SDAG_GFX6-NEXT: v_or_b32_e32 v0, v0, v3 ; SDAG_GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; SDAG_GFX6-NEXT: s_setpc_b64 s[30:31] ; ; SDAG_GFX7-LABEL: v_roundeven_v4f16: ; SDAG_GFX7: ; %bb.0: ; SDAG_GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG_GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SDAG_GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SDAG_GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SDAG_GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; SDAG_GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG_GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SDAG_GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG_GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG_GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG_GFX7-NEXT: v_rndne_f32_e32 v2, v2 ; SDAG_GFX7-NEXT: v_rndne_f32_e32 v3, v3 -; SDAG_GFX7-NEXT: v_rndne_f32_e32 v0, v0 ; SDAG_GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SDAG_GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SDAG_GFX7-NEXT: v_rndne_f32_e32 v1, v1 +; SDAG_GFX7-NEXT: v_rndne_f32_e32 v0, v0 ; SDAG_GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SDAG_GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SDAG_GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SDAG_GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SDAG_GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; SDAG_GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SDAG_GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SDAG_GFX7-NEXT: v_or_b32_e32 v0, v0, v3 ; SDAG_GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; SDAG_GFX7-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll index 6d4b1c4621054..b2317cd653842 100644 --- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll @@ -13,12 +13,11 @@ define half @add_select_fabs_fabs_f16(i32 %c, half %x, half %y, half %z) { ; CI-LABEL: add_select_fabs_fabs_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; CI-NEXT: v_add_f32_e64 v0, |v0|, v3 +; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; CI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; CI-NEXT: v_add_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -77,15 +76,15 @@ define { half, half } @add_select_multi_use_lhs_fabs_fabs_f16(i32 %c, half %x, h ; CI-LABEL: add_select_multi_use_lhs_fabs_fabs_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; CI-NEXT: v_add_f32_e64 v0, |v0|, v4 -; CI-NEXT: v_add_f32_e64 v1, |v1|, v3 +; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1| +; CI-NEXT: v_add_f32_e32 v0, v0, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_add_f32_e32 v1, v1, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -152,14 +151,13 @@ define { half, half } @add_select_multi_store_use_lhs_fabs_fabs_f16(i32 %c, half ; CI-LABEL: add_select_multi_store_use_lhs_fabs_fabs_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; CI-NEXT: v_add_f32_e64 v0, |v0|, v3 +; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; CI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; CI-NEXT: v_add_f32_e32 v0, v0, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f16_f32_e64 v1, |v1| ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_multi_store_use_lhs_fabs_fabs_f16: @@ -225,15 +223,15 @@ define { half, half } @add_select_multi_use_rhs_fabs_fabs_f16(i32 %c, half %x, h ; CI-LABEL: add_select_multi_use_rhs_fabs_fabs_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; CI-NEXT: v_add_f32_e64 v0, |v0|, v3 -; CI-NEXT: v_add_f32_e64 v1, |v2|, v4 +; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; CI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| +; CI-NEXT: v_add_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_add_f32_e32 v1, v2, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -300,12 +298,12 @@ define half @add_select_fabs_var_f16(i32 %c, half %x, half %y, half %z) { ; CI-LABEL: add_select_fabs_var_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; CI-NEXT: v_add_f32_e32 v0, v0, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; CI-NEXT: v_add_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -368,11 +366,13 @@ define half @add_select_fabs_negk_f16(i32 %c, half %x, half %y) { ; CI-LABEL: add_select_fabs_negk_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; CI-NEXT: v_mov_b32_e32 v3, 0xffffbc00 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, -1.0, v1, vcc -; CI-NEXT: v_add_f32_e32 v0, v0, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; CI-NEXT: v_add_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -437,9 +437,12 @@ define half @add_select_fabs_negk_negk_f16(i32 %c, half %x) { ; CI-LABEL: add_select_fabs_negk_negk_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_mov_b32_e32 v2, 0xbc00 +; CI-NEXT: v_mov_b32_e32 v3, 0xc000 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e64 v0, -1.0, -2.0, vcc +; CI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_sub_f32_e32 v0, v1, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] @@ -504,9 +507,12 @@ define half @add_select_posk_posk_f16(i32 %c, half %x) { ; CI-LABEL: add_select_posk_posk_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_mov_b32_e32 v2, 0x3c00 +; CI-NEXT: v_mov_b32_e32 v3, 0x4000 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e64 v0, 1.0, 2.0, vcc +; CI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_add_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] @@ -570,11 +576,13 @@ define half @add_select_negk_fabs_f16(i32 %c, half %x, half %y) { ; CI-LABEL: add_select_negk_fabs_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; CI-NEXT: v_mov_b32_e32 v3, 0xffffbc00 ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, -1.0, v1, vcc -; CI-NEXT: v_add_f32_e32 v0, v0, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; CI-NEXT: v_add_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -638,12 +646,13 @@ define half @add_select_negliteralk_fabs_f16(i32 %c, half %x, half %y) { ; CI-LABEL: add_select_negliteralk_fabs_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_mov_b32_e32 v3, 0xc4800000 +; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; CI-NEXT: v_mov_b32_e32 v3, 0xffffe400 ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; CI-NEXT: v_add_f32_e32 v0, v0, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; CI-NEXT: v_add_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -707,11 +716,12 @@ define half @add_select_fabs_posk_f16(i32 %c, half %x, half %y) { ; CI-LABEL: add_select_fabs_posk_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_mov_b32_e32 v3, 0x3c00 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; CI-NEXT: v_add_f32_e64 v0, |v0|, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; CI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; CI-NEXT: v_add_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -770,11 +780,12 @@ define half @add_select_posk_fabs_f16(i32 %c, half %x, half %y) { ; CI-LABEL: add_select_posk_fabs_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_mov_b32_e32 v3, 0x3c00 ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; CI-NEXT: v_add_f32_e64 v0, |v0|, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; CI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; CI-NEXT: v_add_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -833,12 +844,11 @@ define half @add_select_fneg_fneg_f16(i32 %c, half %x, half %y, half %z) { ; CI-LABEL: add_select_fneg_fneg_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; CI-NEXT: v_sub_f32_e32 v0, v3, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; CI-NEXT: v_sub_f32_e32 v0, v1, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -897,15 +907,15 @@ define { half, half } @add_select_multi_use_lhs_fneg_fneg_f16(i32 %c, half %x, h ; CI-LABEL: add_select_multi_use_lhs_fneg_fneg_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; CI-NEXT: v_sub_f32_e32 v0, v3, v0 -; CI-NEXT: v_sub_f32_e32 v1, v4, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; CI-NEXT: v_sub_f32_e32 v0, v2, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_sub_f32_e32 v1, v3, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -972,15 +982,13 @@ define { half, half } @add_select_multi_store_use_lhs_fneg_fneg_f16(i32 %c, half ; CI-LABEL: add_select_multi_store_use_lhs_fneg_fneg_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v4, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; CI-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; CI-NEXT: v_sub_f32_e32 v0, v3, v0 +; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; CI-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 +; CI-NEXT: v_sub_f32_e32 v0, v2, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_xor_b32_e32 v1, 0x8000, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_multi_store_use_lhs_fneg_fneg_f16: @@ -1046,15 +1054,15 @@ define { half, half } @add_select_multi_use_rhs_fneg_fneg_f16(i32 %c, half %x, h ; CI-LABEL: add_select_multi_use_rhs_fneg_fneg_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; CI-NEXT: v_sub_f32_e32 v0, v3, v0 -; CI-NEXT: v_sub_f32_e32 v1, v4, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; CI-NEXT: v_sub_f32_e32 v0, v1, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_sub_f32_e32 v1, v3, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -1121,13 +1129,12 @@ define half @add_select_fneg_var_f16(i32 %c, half %x, half %y, half %z) { ; CI-LABEL: add_select_fneg_var_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e64 v1, -v1 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_xor_b32_e32 v1, 0x8000, v1 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; CI-NEXT: v_add_f32_e32 v0, v0, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; CI-NEXT: v_add_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -1190,11 +1197,12 @@ define half @add_select_fneg_negk_f16(i32 %c, half %x, half %y) { ; CI-LABEL: add_select_fneg_negk_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_mov_b32_e32 v3, 0x3c00 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; CI-NEXT: v_sub_f32_e32 v0, v2, v0 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; CI-NEXT: v_sub_f32_e32 v0, v1, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -1253,12 +1261,12 @@ define half @add_select_fneg_inv2pi_f16(i32 %c, half %x, half %y) { ; CI-LABEL: add_select_fneg_inv2pi_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_mov_b32_e32 v3, 0xbe230000 +; CI-NEXT: v_mov_b32_e32 v3, 0xffffb118 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; CI-NEXT: v_sub_f32_e32 v0, v2, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; CI-NEXT: v_sub_f32_e32 v0, v1, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -1317,12 +1325,12 @@ define half @add_select_fneg_neginv2pi_f16(i32 %c, half %x, half %y) { ; CI-LABEL: add_select_fneg_neginv2pi_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_mov_b32_e32 v3, 0x3e230000 +; CI-NEXT: v_mov_b32_e32 v3, 0x3118 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; CI-NEXT: v_sub_f32_e32 v0, v2, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; CI-NEXT: v_sub_f32_e32 v0, v1, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -1381,9 +1389,12 @@ define half @add_select_negk_negk_f16(i32 %c, half %x) { ; CI-LABEL: add_select_negk_negk_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_mov_b32_e32 v2, 0xbc00 +; CI-NEXT: v_mov_b32_e32 v3, 0xc000 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e64 v0, -1.0, -2.0, vcc +; CI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_add_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] @@ -1447,11 +1458,12 @@ define half @add_select_negliteralk_negliteralk_f16(i32 %c, half %x) { ; CI-LABEL: add_select_negliteralk_negliteralk_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_mov_b32_e32 v2, 0xc5800000 -; CI-NEXT: v_mov_b32_e32 v3, 0xc5000000 +; CI-NEXT: v_mov_b32_e32 v2, 0xec00 +; CI-NEXT: v_mov_b32_e32 v3, 0xe800 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_add_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] @@ -1515,9 +1527,12 @@ define half @add_select_fneg_negk_negk_f16(i32 %c, half %x) { ; CI-LABEL: add_select_fneg_negk_negk_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_mov_b32_e32 v2, 0xbc00 +; CI-NEXT: v_mov_b32_e32 v3, 0xc000 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e64 v0, -1.0, -2.0, vcc +; CI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_sub_f32_e32 v0, v1, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] @@ -1582,11 +1597,12 @@ define half @add_select_negk_fneg_f16(i32 %c, half %x, half %y) { ; CI-LABEL: add_select_negk_fneg_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_mov_b32_e32 v3, 0x3c00 ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; CI-NEXT: v_sub_f32_e32 v0, v2, v0 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; CI-NEXT: v_sub_f32_e32 v0, v1, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -1645,11 +1661,12 @@ define half @add_select_fneg_posk_f16(i32 %c, half %x, half %y) { ; CI-LABEL: add_select_fneg_posk_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_mov_b32_e32 v3, 0xffffbc00 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, -1.0, v1, vcc -; CI-NEXT: v_sub_f32_e32 v0, v2, v0 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; CI-NEXT: v_sub_f32_e32 v0, v1, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -1708,11 +1725,12 @@ define half @add_select_posk_fneg_f16(i32 %c, half %x, half %y) { ; CI-LABEL: add_select_posk_fneg_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_mov_b32_e32 v3, 0xffffbc00 ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, -1.0, v1, vcc -; CI-NEXT: v_sub_f32_e32 v0, v2, v0 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; CI-NEXT: v_sub_f32_e32 v0, v1, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -1771,13 +1789,13 @@ define half @add_select_negfabs_fabs_f16(i32 %c, half %x, half %y, half %z) { ; CI-LABEL: add_select_negfabs_fabs_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; CI-NEXT: v_cvt_f32_f16_e64 v1, -|v1| -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; CI-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; CI-NEXT: v_add_f32_e32 v0, v0, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; CI-NEXT: v_add_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -1847,13 +1865,13 @@ define half @add_select_fabs_negfabs_f16(i32 %c, half %x, half %y, half %z) { ; CI-LABEL: add_select_fabs_negfabs_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v2, 0x7fff, v2 -; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; CI-NEXT: v_cvt_f32_f16_e64 v2, -|v2| -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; CI-NEXT: v_or_b32_e32 v2, 0x8000, v2 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; CI-NEXT: v_add_f32_e32 v0, v0, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; CI-NEXT: v_add_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -1923,13 +1941,13 @@ define half @add_select_neg_fabs_f16(i32 %c, half %x, half %y, half %z) { ; CI-LABEL: add_select_neg_fabs_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; CI-NEXT: v_cvt_f32_f16_e64 v1, -v1 -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; CI-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; CI-NEXT: v_add_f32_e32 v0, v0, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; CI-NEXT: v_add_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -1998,13 +2016,13 @@ define half @add_select_fabs_neg_f16(i32 %c, half %x, half %y, half %z) { ; CI-LABEL: add_select_fabs_neg_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; CI-NEXT: v_cvt_f32_f16_e64 v2, -v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; CI-NEXT: v_xor_b32_e32 v2, 0x8000, v2 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; CI-NEXT: v_add_f32_e32 v0, v0, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; CI-NEXT: v_add_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -2073,12 +2091,12 @@ define half @add_select_neg_negfabs_f16(i32 %c, half %x, half %y, half %z) { ; CI-LABEL: add_select_neg_negfabs_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; CI-NEXT: v_sub_f32_e32 v0, v3, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; CI-NEXT: v_sub_f32_e32 v0, v1, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -2143,12 +2161,12 @@ define half @add_select_negfabs_neg_f16(i32 %c, half %x, half %y, half %z) { ; CI-LABEL: add_select_negfabs_neg_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; CI-NEXT: v_sub_f32_e32 v0, v3, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; CI-NEXT: v_sub_f32_e32 v0, v1, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -2213,12 +2231,13 @@ define half @mul_select_negfabs_posk_f16(i32 %c, half %x, half %y) { ; CI-LABEL: mul_select_negfabs_posk_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; CI-NEXT: v_cvt_f32_f16_e64 v1, -|v1| -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; CI-NEXT: v_mov_b32_e32 v3, 0x4400 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, 4.0, v1, vcc -; CI-NEXT: v_mul_f32_e32 v0, v0, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; CI-NEXT: v_mul_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -2283,12 +2302,13 @@ define half @mul_select_posk_negfabs_f16(i32 %c, half %x, half %y) { ; CI-LABEL: mul_select_posk_negfabs_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; CI-NEXT: v_cvt_f32_f16_e64 v1, -|v1| -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; CI-NEXT: v_mov_b32_e32 v3, 0x4400 ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, 4.0, v1, vcc -; CI-NEXT: v_mul_f32_e32 v0, v0, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; CI-NEXT: v_mul_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -2353,12 +2373,13 @@ define half @mul_select_negfabs_negk_f16(i32 %c, half %x, half %y) { ; CI-LABEL: mul_select_negfabs_negk_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; CI-NEXT: v_cvt_f32_f16_e64 v1, -|v1| -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; CI-NEXT: v_mov_b32_e32 v3, 0xffffc400 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, -4.0, v1, vcc -; CI-NEXT: v_mul_f32_e32 v0, v0, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; CI-NEXT: v_mul_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -2423,12 +2444,13 @@ define half @mul_select_negk_negfabs_f16(i32 %c, half %x, half %y) { ; CI-LABEL: mul_select_negk_negfabs_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; CI-NEXT: v_cvt_f32_f16_e64 v1, -|v1| -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; CI-NEXT: v_mov_b32_e32 v3, 0xffffc400 ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, -4.0, v1, vcc -; CI-NEXT: v_mul_f32_e32 v0, v0, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; CI-NEXT: v_mul_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -2498,10 +2520,12 @@ define half @select_fneg_posk_src_add_f16(i32 %c, half %x, half %y) { ; CI-SAFE: ; %bb.0: ; CI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-SAFE-NEXT: v_mov_b32_e32 v2, 0x4000 ; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-SAFE-NEXT: v_add_f32_e32 v1, 4.0, v1 -; CI-SAFE-NEXT: v_cndmask_b32_e64 v0, 2.0, -v1, vcc -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-SAFE-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 +; CI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; CI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: select_fneg_posk_src_add_f16: @@ -2538,10 +2562,11 @@ define half @select_fneg_posk_src_add_f16(i32 %c, half %x, half %y) { ; CI-NSZ: ; %bb.0: ; CI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NSZ-NEXT: v_mov_b32_e32 v2, 0x4000 ; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NSZ-NEXT: v_sub_f32_e32 v1, -4.0, v1 -; CI-NSZ-NEXT: v_cndmask_b32_e32 v0, 2.0, v1, vcc -; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NSZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; CI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-NSZ-LABEL: select_fneg_posk_src_add_f16: @@ -2582,10 +2607,12 @@ define half @select_fneg_posk_src_sub_f16(i32 %c, half %x) { ; CI-SAFE: ; %bb.0: ; CI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-SAFE-NEXT: v_mov_b32_e32 v2, 0x4000 ; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-SAFE-NEXT: v_add_f32_e32 v1, -4.0, v1 -; CI-SAFE-NEXT: v_cndmask_b32_e64 v0, 2.0, -v1, vcc -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-SAFE-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 +; CI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; CI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: select_fneg_posk_src_sub_f16: @@ -2622,10 +2649,11 @@ define half @select_fneg_posk_src_sub_f16(i32 %c, half %x) { ; CI-NSZ: ; %bb.0: ; CI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NSZ-NEXT: v_mov_b32_e32 v2, 0x4000 ; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NSZ-NEXT: v_sub_f32_e32 v1, 4.0, v1 -; CI-NSZ-NEXT: v_cndmask_b32_e32 v0, 2.0, v1, vcc -; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NSZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; CI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-NSZ-LABEL: select_fneg_posk_src_sub_f16: @@ -2666,10 +2694,11 @@ define half @select_fneg_posk_src_mul_f16(i32 %c, half %x) { ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_mov_b32_e32 v2, 0x4000 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_mul_f32_e32 v1, -4.0, v1 -; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v1, vcc -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: select_fneg_posk_src_mul_f16: @@ -2728,11 +2757,57 @@ define half @select_fneg_posk_src_fma_f16(i32 %c, half %x, half %z) { ; CI-SAFE: ; %bb.0: ; CI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v1 +; CI-SAFE-NEXT: s_movk_i32 s4, 0x3f1 +; CI-SAFE-NEXT: v_cvt_f64_f32_e32 v[1:2], v2 +; CI-SAFE-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 +; CI-SAFE-NEXT: v_fma_f64 v[1:2], v[3:4], 4.0, v[1:2] +; CI-SAFE-NEXT: v_and_b32_e32 v3, 0x1ff, v2 +; CI-SAFE-NEXT: v_or_b32_e32 v1, v3, v1 +; CI-SAFE-NEXT: v_lshrrev_b32_e32 v4, 8, v2 +; CI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; CI-SAFE-NEXT: v_and_b32_e32 v3, 0xffe, v4 +; CI-SAFE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CI-SAFE-NEXT: v_bfe_u32 v4, v2, 20, 11 +; CI-SAFE-NEXT: v_or_b32_e32 v1, v3, v1 +; CI-SAFE-NEXT: v_sub_i32_e32 v5, vcc, s4, v4 +; CI-SAFE-NEXT: v_or_b32_e32 v3, 0x1000, v1 +; CI-SAFE-NEXT: v_med3_i32 v5, v5, 0, 13 +; CI-SAFE-NEXT: v_lshrrev_b32_e32 v6, v5, v3 +; CI-SAFE-NEXT: v_lshlrev_b32_e32 v5, v5, v6 +; CI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, v5, v3 +; CI-SAFE-NEXT: s_movk_i32 s4, 0xfc10 +; CI-SAFE-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CI-SAFE-NEXT: v_add_i32_e32 v4, vcc, s4, v4 +; CI-SAFE-NEXT: v_lshlrev_b32_e32 v5, 12, v4 +; CI-SAFE-NEXT: v_or_b32_e32 v3, v6, v3 +; CI-SAFE-NEXT: v_or_b32_e32 v5, v1, v5 +; CI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 1, v4 +; CI-SAFE-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; CI-SAFE-NEXT: v_and_b32_e32 v5, 7, v3 +; CI-SAFE-NEXT: v_cmp_lt_i32_e32 vcc, 5, v5 +; CI-SAFE-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 3, v5 +; CI-SAFE-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CI-SAFE-NEXT: v_or_b32_e32 v5, v5, v6 +; CI-SAFE-NEXT: v_lshrrev_b32_e32 v3, 2, v3 +; CI-SAFE-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CI-SAFE-NEXT: v_mov_b32_e32 v5, 0x7c00 +; CI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 31, v4 +; CI-SAFE-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; CI-SAFE-NEXT: v_mov_b32_e32 v6, 0x7e00 +; CI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; CI-SAFE-NEXT: s_movk_i32 s4, 0x40f +; CI-SAFE-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, s4, v4 +; CI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CI-SAFE-NEXT: v_and_b32_e32 v2, 0x8000, v2 +; CI-SAFE-NEXT: v_or_b32_e32 v1, v2, v1 +; CI-SAFE-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 +; CI-SAFE-NEXT: v_mov_b32_e32 v2, 0x4000 ; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-SAFE-NEXT: v_fma_f32 v1, v1, 4.0, v2 -; CI-SAFE-NEXT: v_cndmask_b32_e64 v0, 2.0, -v1, vcc -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; CI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: select_fneg_posk_src_fma_f16: @@ -2768,12 +2843,57 @@ define half @select_fneg_posk_src_fma_f16(i32 %c, half %x, half %z) { ; CI-NSZ-LABEL: select_fneg_posk_src_fma_f16: ; CI-NSZ: ; %bb.0: ; CI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NSZ-NEXT: v_cvt_f32_f16_e64 v2, -v2 +; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v1 +; CI-NSZ-NEXT: s_movk_i32 s4, 0x3f1 +; CI-NSZ-NEXT: v_cvt_f64_f32_e32 v[1:2], v2 +; CI-NSZ-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 +; CI-NSZ-NEXT: v_fma_f64 v[1:2], v[3:4], -4.0, v[1:2] +; CI-NSZ-NEXT: v_and_b32_e32 v3, 0x1ff, v2 +; CI-NSZ-NEXT: v_or_b32_e32 v1, v3, v1 +; CI-NSZ-NEXT: v_lshrrev_b32_e32 v4, 8, v2 +; CI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; CI-NSZ-NEXT: v_and_b32_e32 v3, 0xffe, v4 +; CI-NSZ-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CI-NSZ-NEXT: v_bfe_u32 v4, v2, 20, 11 +; CI-NSZ-NEXT: v_or_b32_e32 v1, v3, v1 +; CI-NSZ-NEXT: v_sub_i32_e32 v5, vcc, s4, v4 +; CI-NSZ-NEXT: v_or_b32_e32 v3, 0x1000, v1 +; CI-NSZ-NEXT: v_med3_i32 v5, v5, 0, 13 +; CI-NSZ-NEXT: v_lshrrev_b32_e32 v6, v5, v3 +; CI-NSZ-NEXT: v_lshlrev_b32_e32 v5, v5, v6 +; CI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, v5, v3 +; CI-NSZ-NEXT: s_movk_i32 s4, 0xfc10 +; CI-NSZ-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CI-NSZ-NEXT: v_add_i32_e32 v4, vcc, s4, v4 +; CI-NSZ-NEXT: v_lshlrev_b32_e32 v5, 12, v4 +; CI-NSZ-NEXT: v_or_b32_e32 v3, v6, v3 +; CI-NSZ-NEXT: v_or_b32_e32 v5, v1, v5 +; CI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 1, v4 +; CI-NSZ-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; CI-NSZ-NEXT: v_and_b32_e32 v5, 7, v3 +; CI-NSZ-NEXT: v_cmp_lt_i32_e32 vcc, 5, v5 +; CI-NSZ-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 3, v5 +; CI-NSZ-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CI-NSZ-NEXT: v_or_b32_e32 v5, v5, v6 +; CI-NSZ-NEXT: v_lshrrev_b32_e32 v3, 2, v3 +; CI-NSZ-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CI-NSZ-NEXT: v_mov_b32_e32 v5, 0x7c00 +; CI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 31, v4 +; CI-NSZ-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; CI-NSZ-NEXT: v_mov_b32_e32 v6, 0x7e00 +; CI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; CI-NSZ-NEXT: s_movk_i32 s4, 0x40f +; CI-NSZ-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, s4, v4 +; CI-NSZ-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NSZ-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CI-NSZ-NEXT: v_and_b32_e32 v2, 0x8000, v2 +; CI-NSZ-NEXT: v_or_b32_e32 v1, v2, v1 +; CI-NSZ-NEXT: v_mov_b32_e32 v2, 0x4000 ; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NSZ-NEXT: v_fma_f32 v1, v1, -4.0, -v2 -; CI-NSZ-NEXT: v_cndmask_b32_e32 v0, 2.0, v1, vcc -; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NSZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; CI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-NSZ-LABEL: select_fneg_posk_src_fma_f16: @@ -2817,9 +2937,13 @@ define half @select_fneg_posk_src_fmad_f16(i32 %c, half %x, half %z) { ; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-SAFE-NEXT: v_mul_f32_e32 v1, 4.0, v1 +; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-SAFE-NEXT: v_add_f32_e32 v1, v1, v2 -; CI-SAFE-NEXT: v_cndmask_b32_e64 v0, 2.0, -v1, vcc -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-SAFE-NEXT: v_mov_b32_e32 v2, 0x4000 +; CI-SAFE-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 +; CI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; CI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: select_fneg_posk_src_fmad_f16: @@ -2859,9 +2983,12 @@ define half @select_fneg_posk_src_fmad_f16(i32 %c, half %x, half %z) { ; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NSZ-NEXT: v_mul_f32_e32 v1, -4.0, v1 +; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NSZ-NEXT: v_sub_f32_e32 v1, v1, v2 -; CI-NSZ-NEXT: v_cndmask_b32_e32 v0, 2.0, v1, vcc -; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NSZ-NEXT: v_mov_b32_e32 v2, 0x4000 +; CI-NSZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; CI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-NSZ-LABEL: select_fneg_posk_src_fmad_f16: diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll index b0e920478e3a5..c026a42993d48 100644 --- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll @@ -9,22 +9,21 @@ define <2 x half> @add_select_fabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; CI-LABEL: add_select_fabs_fabs_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; CI-NEXT: v_cvt_f32_f16_e64 v6, |v6| -; CI-NEXT: v_cvt_f32_f16_e64 v7, |v7| -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| -; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3| -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; CI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; CI-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; CI-NEXT: v_add_f32_e32 v1, v1, v5 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; CI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; CI-NEXT: v_add_f32_e32 v1, v1, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_add_f32_e32 v0, v0, v4 +; CI-NEXT: v_add_f32_e32 v0, v0, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -101,26 +100,28 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_lhs_fabs_fabs_v2f16(<2 x ; CI-LABEL: add_select_multi_use_lhs_fabs_fabs_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; CI-NEXT: v_cvt_f32_f16_e64 v8, |v8| -; CI-NEXT: v_cvt_f32_f16_e64 v9, |v9| +; CI-NEXT: v_and_b32_e32 v7, 0x7fff7fff, v2 +; CI-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v3 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_bfe_u32 v9, v2, 16, 15 +; CI-NEXT: v_bfe_u32 v3, v3, 16, 15 +; CI-NEXT: v_cndmask_b32_e32 v0, v8, v7, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; CI-NEXT: v_cndmask_b32_e32 v1, v3, v9, vcc +; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; CI-NEXT: v_cvt_f32_f16_e32 v3, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| -; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3| -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_add_f32_e32 v0, v0, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v6 +; CI-NEXT: v_add_f32_e32 v1, v1, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v9 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; CI-NEXT: v_add_f32_e32 v1, v1, v7 +; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_add_f32_e32 v0, v0, v5 -; CI-NEXT: v_add_f32_e32 v3, v8, v6 +; CI-NEXT: v_add_f32_e32 v3, v5, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_add_f32_e32 v2, v2, v4 @@ -213,26 +214,25 @@ define { <2 x half>, <2 x half> } @add_select_multi_store_use_lhs_fabs_fabs_v2f1 ; CI-LABEL: add_select_multi_store_use_lhs_fabs_fabs_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; CI-NEXT: v_bfe_u32 v7, v2, 16, 15 +; CI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; CI-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; CI-NEXT: v_cvt_f32_f16_e64 v6, |v6| -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e64 v8, |v2| -; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3| +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v5 +; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc -; CI-NEXT: v_add_f32_e32 v1, v1, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; CI-NEXT: v_add_f32_e32 v1, v1, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_add_f32_e32 v0, v0, v4 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2 -; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; CI-NEXT: v_or_b32_e32 v0, v0, v2 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 +; CI-NEXT: v_mov_b32_e32 v1, v2 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_multi_store_use_lhs_fabs_fabs_v2f16: @@ -310,26 +310,28 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_rhs_fabs_fabs_v2f16(<2 x ; CI-LABEL: add_select_multi_use_rhs_fabs_fabs_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; CI-NEXT: v_cvt_f32_f16_e64 v8, |v8| -; CI-NEXT: v_cvt_f32_f16_e64 v9, |v9| +; CI-NEXT: v_and_b32_e32 v7, 0x7fff7fff, v2 +; CI-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v3 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_bfe_u32 v2, v2, 16, 15 +; CI-NEXT: v_bfe_u32 v9, v3, 16, 15 +; CI-NEXT: v_cndmask_b32_e32 v0, v8, v7, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; CI-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc +; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| -; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3| -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_add_f32_e32 v0, v0, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; CI-NEXT: v_add_f32_e32 v1, v1, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v9 ; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; CI-NEXT: v_add_f32_e32 v1, v1, v7 +; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3| ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_add_f32_e32 v0, v0, v4 -; CI-NEXT: v_add_f32_e32 v2, v9, v6 +; CI-NEXT: v_add_f32_e32 v2, v4, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_add_f32_e32 v3, v3, v5 @@ -422,22 +424,20 @@ define <2 x half> @add_select_fabs_var_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; CI-LABEL: add_select_fabs_var_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; CI-NEXT: v_cvt_f32_f16_e64 v6, |v6| -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; CI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; CI-NEXT: v_add_f32_e32 v1, v1, v5 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; CI-NEXT: v_add_f32_e32 v1, v1, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_add_f32_e32 v0, v0, v4 +; CI-NEXT: v_add_f32_e32 v0, v0, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -508,17 +508,19 @@ define <2 x half> @add_select_fabs_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; CI-LABEL: add_select_fabs_negk_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; CI-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v2 +; CI-NEXT: v_mov_b32_e32 v6, 0xffffbc00 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_bfe_u32 v2, v2, 16, 15 +; CI-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e64 v5, |v5| -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| +; CI-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, -1.0, v5, vcc -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, -1.0, v2, vcc -; CI-NEXT: v_add_f32_e32 v1, v1, v4 +; CI-NEXT: v_add_f32_e32 v1, v1, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_add_f32_e32 v0, v0, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -593,13 +595,17 @@ define <2 x half> @add_select_fabs_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x) ; CI-LABEL: add_select_fabs_negk_negk_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v4, 0xbc00 +; CI-NEXT: v_mov_b32_e32 v5, 0xc000 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; CI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e64 v0, -1.0, -2.0, vcc -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e64 v1, -1.0, -2.0, vcc ; CI-NEXT: v_sub_f32_e32 v1, v3, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_sub_f32_e32 v0, v2, v0 @@ -672,13 +678,17 @@ define <2 x half> @add_select_posk_posk_v2f16(<2 x i32> %c, <2 x half> %x) { ; CI-LABEL: add_select_posk_posk_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v4, 0x3c00 +; CI-NEXT: v_mov_b32_e32 v5, 0x4000 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; CI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e64 v0, 1.0, 2.0, vcc -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc ; CI-NEXT: v_add_f32_e32 v1, v1, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_add_f32_e32 v0, v0, v2 @@ -750,17 +760,19 @@ define <2 x half> @add_select_negk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; CI-LABEL: add_select_negk_fabs_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; CI-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v2 +; CI-NEXT: v_mov_b32_e32 v6, 0xffffbc00 +; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; CI-NEXT: v_bfe_u32 v2, v2, 16, 15 +; CI-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc +; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e64 v5, |v5| -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| +; CI-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, -1.0, v5, vcc -; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, -1.0, v2, vcc -; CI-NEXT: v_add_f32_e32 v1, v1, v4 +; CI-NEXT: v_add_f32_e32 v1, v1, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_add_f32_e32 v0, v0, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -834,18 +846,19 @@ define <2 x half> @add_select_negliteralk_fabs_v2f16(<2 x i32> %c, <2 x half> %x ; CI-LABEL: add_select_negliteralk_fabs_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; CI-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v2 +; CI-NEXT: v_mov_b32_e32 v6, 0xffffe400 +; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; CI-NEXT: v_bfe_u32 v2, v2, 16, 15 +; CI-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc +; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e64 v5, |v5| -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| +; CI-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_mov_b32_e32 v6, 0xc4800000 -; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc -; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; CI-NEXT: v_add_f32_e32 v1, v1, v4 +; CI-NEXT: v_add_f32_e32 v1, v1, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_add_f32_e32 v0, v0, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -919,17 +932,19 @@ define <2 x half> @add_select_fabs_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; CI-LABEL: add_select_fabs_posk_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; CI-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v2 +; CI-NEXT: v_mov_b32_e32 v6, 0x3c00 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_bfe_u32 v2, v2, 16, 15 +; CI-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e64 v5, |v5| -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| +; CI-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, 1.0, v5, vcc -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc -; CI-NEXT: v_add_f32_e32 v1, v1, v4 +; CI-NEXT: v_add_f32_e32 v1, v1, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_add_f32_e32 v0, v0, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1003,17 +1018,19 @@ define <2 x half> @add_select_posk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; CI-LABEL: add_select_posk_fabs_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; CI-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v2 +; CI-NEXT: v_mov_b32_e32 v6, 0x3c00 +; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; CI-NEXT: v_bfe_u32 v2, v2, 16, 15 +; CI-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc +; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e64 v5, |v5| -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| +; CI-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, 1.0, v5, vcc -; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc -; CI-NEXT: v_add_f32_e32 v1, v1, v4 +; CI-NEXT: v_add_f32_e32 v1, v1, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_add_f32_e32 v0, v0, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1087,22 +1104,19 @@ define <2 x half> @add_select_fneg_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; CI-LABEL: add_select_fneg_fneg_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; CI-NEXT: v_sub_f32_e32 v1, v5, v1 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; CI-NEXT: v_sub_f32_e32 v1, v2, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_sub_f32_e32 v0, v4, v0 +; CI-NEXT: v_sub_f32_e32 v0, v3, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1169,26 +1183,26 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_lhs_fneg_fneg_v2f16(<2 x ; CI-LABEL: add_select_multi_use_lhs_fneg_fneg_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, v8, v7, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 ; CI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; CI-NEXT: v_sub_f32_e32 v1, v7, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; CI-NEXT: v_sub_f32_e32 v1, v3, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v7 ; CI-NEXT: v_sub_f32_e32 v0, v4, v0 -; CI-NEXT: v_sub_f32_e32 v3, v6, v8 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_sub_f32_e32 v3, v4, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_sub_f32_e32 v2, v5, v2 @@ -1269,20 +1283,17 @@ define { <2 x half>, <2 x half> } @add_select_multi_store_use_lhs_fneg_fneg_v2f1 ; CI-LABEL: add_select_multi_store_use_lhs_fneg_fneg_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc -; CI-NEXT: v_sub_f32_e32 v1, v5, v1 +; CI-NEXT: v_sub_f32_e32 v1, v3, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v1 ; CI-NEXT: v_sub_f32_e32 v0, v4, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1358,26 +1369,26 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_rhs_fneg_fneg_v2f16(<2 x ; CI-LABEL: add_select_multi_use_rhs_fneg_fneg_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, v8, v7, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 ; CI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; CI-NEXT: v_sub_f32_e32 v1, v7, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; CI-NEXT: v_sub_f32_e32 v1, v2, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v8 ; CI-NEXT: v_sub_f32_e32 v0, v4, v0 -; CI-NEXT: v_sub_f32_e32 v2, v6, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_sub_f32_e32 v2, v4, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_sub_f32_e32 v3, v5, v3 @@ -1459,22 +1470,19 @@ define <2 x half> @add_select_fneg_var_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; CI-NEXT: v_add_f32_e32 v1, v1, v5 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; CI-NEXT: v_add_f32_e32 v1, v1, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_add_f32_e32 v0, v0, v4 +; CI-NEXT: v_add_f32_e32 v0, v0, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1546,16 +1554,17 @@ define <2 x half> @add_select_fneg_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_mov_b32_e32 v6, 0x3c00 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, 1.0, v5, vcc +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; CI-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc -; CI-NEXT: v_sub_f32_e32 v1, v4, v1 +; CI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_sub_f32_e32 v1, v2, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_sub_f32_e32 v0, v3, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1625,17 +1634,17 @@ define <2 x half> @add_select_fneg_inv2pi_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_mov_b32_e32 v6, 0xbe230000 +; CI-NEXT: v_mov_b32_e32 v6, 0xffffb118 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; CI-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; CI-NEXT: v_sub_f32_e32 v1, v4, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_sub_f32_e32 v1, v2, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_sub_f32_e32 v0, v3, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1705,17 +1714,17 @@ define <2 x half> @add_select_fneg_neginv2pi_v2f16(<2 x i32> %c, <2 x half> %x, ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_mov_b32_e32 v6, 0x3e230000 +; CI-NEXT: v_mov_b32_e32 v6, 0x3118 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; CI-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; CI-NEXT: v_sub_f32_e32 v1, v4, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_sub_f32_e32 v1, v2, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_sub_f32_e32 v0, v3, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1784,13 +1793,17 @@ define <2 x half> @add_select_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x) { ; CI-LABEL: add_select_negk_negk_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v4, 0xbc00 +; CI-NEXT: v_mov_b32_e32 v5, 0xc000 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; CI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e64 v0, -1.0, -2.0, vcc -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e64 v1, -1.0, -2.0, vcc ; CI-NEXT: v_add_f32_e32 v1, v1, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_add_f32_e32 v0, v0, v2 @@ -1862,15 +1875,17 @@ define <2 x half> @add_select_negliteralk_negliteralk_v2f16(<2 x i32> %c, <2 x h ; CI-LABEL: add_select_negliteralk_negliteralk_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_mov_b32_e32 v4, 0xc5800000 -; CI-NEXT: v_mov_b32_e32 v5, 0xc5000000 +; CI-NEXT: v_mov_b32_e32 v4, 0xec00 +; CI-NEXT: v_mov_b32_e32 v5, 0xe800 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; CI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_add_f32_e32 v1, v1, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_add_f32_e32 v0, v0, v2 @@ -1942,13 +1957,17 @@ define <2 x half> @add_select_fneg_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x) ; CI-LABEL: add_select_fneg_negk_negk_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v4, 0xbc00 +; CI-NEXT: v_mov_b32_e32 v5, 0xc000 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; CI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e64 v0, -1.0, -2.0, vcc -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e64 v1, -1.0, -2.0, vcc ; CI-NEXT: v_sub_f32_e32 v1, v3, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_sub_f32_e32 v0, v2, v0 @@ -2022,16 +2041,17 @@ define <2 x half> @add_select_negk_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_mov_b32_e32 v6, 0x3c00 ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, 1.0, v5, vcc +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; CI-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc -; CI-NEXT: v_sub_f32_e32 v1, v4, v1 +; CI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_sub_f32_e32 v1, v2, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_sub_f32_e32 v0, v3, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -2101,16 +2121,17 @@ define <2 x half> @add_select_fneg_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_mov_b32_e32 v6, 0xffffbc00 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, -1.0, v5, vcc +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; CI-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, -1.0, v2, vcc -; CI-NEXT: v_sub_f32_e32 v1, v4, v1 +; CI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_sub_f32_e32 v1, v2, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_sub_f32_e32 v0, v3, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -2180,16 +2201,17 @@ define <2 x half> @add_select_posk_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_mov_b32_e32 v6, 0xffffbc00 ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, -1.0, v5, vcc +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; CI-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, -1.0, v2, vcc -; CI-NEXT: v_sub_f32_e32 v1, v4, v1 +; CI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_sub_f32_e32 v1, v2, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_sub_f32_e32 v0, v3, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -2258,23 +2280,21 @@ define <2 x half> @add_select_negfabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; CI-LABEL: add_select_negfabs_fabs_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 ; CI-NEXT: v_or_b32_e32 v2, 0x80008000, v2 -; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; CI-NEXT: v_cvt_f32_f16_e64 v6, |v6| -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3| -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; CI-NEXT: v_add_f32_e32 v1, v1, v5 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; CI-NEXT: v_add_f32_e32 v1, v1, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_add_f32_e32 v0, v0, v4 +; CI-NEXT: v_add_f32_e32 v0, v0, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -2352,23 +2372,21 @@ define <2 x half> @add_select_fabs_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; CI-LABEL: add_select_fabs_negfabs_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 ; CI-NEXT: v_or_b32_e32 v3, 0x80008000, v3 -; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; CI-NEXT: v_cvt_f32_f16_e64 v6, |v6| -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; CI-NEXT: v_add_f32_e32 v1, v1, v5 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; CI-NEXT: v_add_f32_e32 v1, v1, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_add_f32_e32 v0, v0, v4 +; CI-NEXT: v_add_f32_e32 v0, v0, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -2446,23 +2464,21 @@ define <2 x half> @add_select_neg_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; CI-LABEL: add_select_neg_fabs_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 ; CI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; CI-NEXT: v_cvt_f32_f16_e64 v6, |v6| -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3| -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; CI-NEXT: v_add_f32_e32 v1, v1, v5 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; CI-NEXT: v_add_f32_e32 v1, v1, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_add_f32_e32 v0, v0, v4 +; CI-NEXT: v_add_f32_e32 v0, v0, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -2539,23 +2555,21 @@ define <2 x half> @add_select_fabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; CI-LABEL: add_select_fabs_neg_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 ; CI-NEXT: v_xor_b32_e32 v3, 0x80008000, v3 -; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; CI-NEXT: v_cvt_f32_f16_e64 v6, |v6| -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; CI-NEXT: v_add_f32_e32 v1, v1, v5 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; CI-NEXT: v_add_f32_e32 v1, v1, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_add_f32_e32 v0, v0, v4 +; CI-NEXT: v_add_f32_e32 v0, v0, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -2632,22 +2646,20 @@ define <2 x half> @add_select_neg_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; CI-LABEL: add_select_neg_negfabs_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e64 v7, |v7| -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3| -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; CI-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; CI-NEXT: v_sub_f32_e32 v1, v5, v1 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; CI-NEXT: v_sub_f32_e32 v1, v2, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_sub_f32_e32 v0, v4, v0 +; CI-NEXT: v_sub_f32_e32 v0, v3, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -2720,22 +2732,20 @@ define <2 x half> @add_select_negfabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; CI-LABEL: add_select_negfabs_neg_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e64 v7, |v7| -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; CI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; CI-NEXT: v_sub_f32_e32 v1, v5, v1 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; CI-NEXT: v_sub_f32_e32 v1, v2, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_sub_f32_e32 v0, v4, v0 +; CI-NEXT: v_sub_f32_e32 v0, v3, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -2809,17 +2819,18 @@ define <2 x half> @mul_select_negfabs_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_or_b32_e32 v2, 0x80008000, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_mov_b32_e32 v6, 0x4400 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, 4.0, v5, vcc +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, 4.0, v2, vcc -; CI-NEXT: v_mul_f32_e32 v1, v1, v4 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; CI-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_mul_f32_e32 v1, v1, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_mul_f32_e32 v0, v0, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -2895,17 +2906,18 @@ define <2 x half> @mul_select_posk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_or_b32_e32 v2, 0x80008000, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_mov_b32_e32 v6, 0x4400 ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, 4.0, v5, vcc +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, 4.0, v2, vcc -; CI-NEXT: v_mul_f32_e32 v1, v1, v4 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; CI-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_mul_f32_e32 v1, v1, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_mul_f32_e32 v0, v0, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -2981,17 +2993,18 @@ define <2 x half> @mul_select_negfabs_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_or_b32_e32 v2, 0x80008000, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_mov_b32_e32 v6, 0xffffc400 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, -4.0, v5, vcc +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, -4.0, v2, vcc -; CI-NEXT: v_mul_f32_e32 v1, v1, v4 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; CI-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_mul_f32_e32 v1, v1, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_mul_f32_e32 v0, v0, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -3067,17 +3080,18 @@ define <2 x half> @mul_select_negk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_or_b32_e32 v2, 0x80008000, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_mov_b32_e32 v6, 0xffffc400 ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, -4.0, v5, vcc +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, -4.0, v2, vcc -; CI-NEXT: v_mul_f32_e32 v1, v1, v4 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; CI-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_mul_f32_e32 v1, v1, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_mul_f32_e32 v0, v0, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -3159,7 +3173,8 @@ define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, < ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_mov_b32_e32 v4, 0x4000 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_add_f32_e32 v3, 4.0, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_add_f32_e32 v2, 4.0, v2 @@ -3168,13 +3183,10 @@ define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, < ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] @@ -3248,16 +3260,17 @@ define <2 x half> @select_fneg_posk_src_add_v2f16_nsz(<2 x i32> %c, <2 x half> % ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_sub_f32_e32 v3, -4.0, v3 -; CI-NEXT: v_sub_f32_e32 v2, -4.0, v2 -; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_mov_b32_e32 v4, 0x4000 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_sub_f32_e32 v2, -4.0, v2 +; CI-NEXT: v_sub_f32_e32 v3, -4.0, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] @@ -3327,7 +3340,8 @@ define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) { ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_mov_b32_e32 v4, 0x4000 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_add_f32_e32 v3, -4.0, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_add_f32_e32 v2, -4.0, v2 @@ -3336,13 +3350,10 @@ define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) { ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] @@ -3416,16 +3427,17 @@ define <2 x half> @select_fneg_posk_src_sub_v2f16_nsz(<2 x i32> %c, <2 x half> % ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_sub_f32_e32 v3, 4.0, v3 -; CI-NEXT: v_sub_f32_e32 v2, 4.0, v2 -; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_mov_b32_e32 v4, 0x4000 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_sub_f32_e32 v2, 4.0, v2 +; CI-NEXT: v_sub_f32_e32 v3, 4.0, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] @@ -3493,16 +3505,17 @@ define <2 x half> @select_fneg_posk_src_mul_v2f16(<2 x i32> %c, <2 x half> %x) { ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_mul_f32_e32 v3, -4.0, v3 -; CI-NEXT: v_mul_f32_e32 v2, -4.0, v2 -; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_mov_b32_e32 v4, 0x4000 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_mul_f32_e32 v2, -4.0, v2 +; CI-NEXT: v_mul_f32_e32 v3, -4.0, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] @@ -3569,28 +3582,110 @@ define <2 x half> @select_fneg_posk_src_fma_v2f16(<2 x i32> %c, <2 x half> %x, < ; CI-LABEL: select_fneg_posk_src_fma_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v2 +; CI-NEXT: s_movk_i32 s4, 0x3f1 +; CI-NEXT: s_movk_i32 s5, 0xfc10 +; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_fma_f32 v4, v5, 4.0, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_fma_f32 v2, v2, 4.0, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; CI-NEXT: v_or_b32_e32 v2, v2, v3 +; CI-NEXT: v_fma_f64 v[4:5], v[6:7], 4.0, v[4:5] +; CI-NEXT: v_cvt_f32_f16_e32 v10, v2 +; CI-NEXT: v_and_b32_e32 v6, 0x1ff, v5 +; CI-NEXT: v_or_b32_e32 v4, v6, v4 +; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; CI-NEXT: v_lshrrev_b32_e32 v6, 8, v5 +; CI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CI-NEXT: v_and_b32_e32 v6, 0xffe, v6 +; CI-NEXT: v_bfe_u32 v7, v5, 20, 11 +; CI-NEXT: v_or_b32_e32 v4, v6, v4 +; CI-NEXT: v_sub_i32_e32 v8, vcc, s4, v7 +; CI-NEXT: v_or_b32_e32 v6, 0x1000, v4 +; CI-NEXT: v_med3_i32 v8, v8, 0, 13 +; CI-NEXT: v_lshrrev_b32_e32 v9, v8, v6 +; CI-NEXT: v_lshlrev_b32_e32 v8, v8, v9 +; CI-NEXT: v_cmp_ne_u32_e32 vcc, v8, v6 +; CI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CI-NEXT: v_add_i32_e32 v7, vcc, s5, v7 +; CI-NEXT: v_lshlrev_b32_e32 v8, 12, v7 +; CI-NEXT: v_or_b32_e32 v6, v9, v6 +; CI-NEXT: v_or_b32_e32 v8, v4, v8 +; CI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v7 +; CI-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; CI-NEXT: v_and_b32_e32 v8, 7, v6 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v8 +; CI-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v8 +; CI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CI-NEXT: v_or_b32_e32 v8, v8, v9 +; CI-NEXT: v_lshrrev_b32_e32 v6, 2, v6 +; CI-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CI-NEXT: v_mov_b32_e32 v8, 0x7c00 +; CI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v7 +; CI-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; CI-NEXT: v_mov_b32_e32 v9, 0x7e00 +; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; CI-NEXT: s_movk_i32 s6, 0x40f +; CI-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7 +; CI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v10 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; CI-NEXT: v_and_b32_e32 v5, 0x8000, v5 +; CI-NEXT: v_or_b32_e32 v4, v5, v4 +; CI-NEXT: v_fma_f64 v[2:3], v[6:7], 4.0, v[2:3] +; CI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; CI-NEXT: v_and_b32_e32 v5, 0x1ff, v3 +; CI-NEXT: v_or_b32_e32 v2, v5, v2 +; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; CI-NEXT: v_lshrrev_b32_e32 v5, 8, v3 +; CI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CI-NEXT: v_and_b32_e32 v5, 0xffe, v5 +; CI-NEXT: v_bfe_u32 v6, v3, 20, 11 +; CI-NEXT: v_or_b32_e32 v2, v5, v2 +; CI-NEXT: v_sub_i32_e32 v7, vcc, s4, v6 +; CI-NEXT: v_or_b32_e32 v5, 0x1000, v2 +; CI-NEXT: v_med3_i32 v7, v7, 0, 13 +; CI-NEXT: v_lshrrev_b32_e32 v10, v7, v5 +; CI-NEXT: v_lshlrev_b32_e32 v7, v7, v10 +; CI-NEXT: v_cmp_ne_u32_e32 vcc, v7, v5 +; CI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CI-NEXT: v_add_i32_e32 v6, vcc, s5, v6 +; CI-NEXT: v_lshlrev_b32_e32 v7, 12, v6 +; CI-NEXT: v_or_b32_e32 v5, v10, v5 +; CI-NEXT: v_or_b32_e32 v7, v2, v7 +; CI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v6 +; CI-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc +; CI-NEXT: v_and_b32_e32 v7, 7, v5 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v7 +; CI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7 +; CI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CI-NEXT: v_or_b32_e32 v7, v7, v10 +; CI-NEXT: v_lshrrev_b32_e32 v5, 2, v5 +; CI-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v6 +; CI-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc +; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; CI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v6 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; CI-NEXT: v_and_b32_e32 v3, 0x8000, v3 +; CI-NEXT: v_or_b32_e32 v2, v3, v2 +; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_or_b32_e32 v2, v4, v2 ; CI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc +; CI-NEXT: v_mov_b32_e32 v4, 0x4000 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] @@ -3665,30 +3760,32 @@ define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x, ; CI-LABEL: select_fneg_posk_src_fmad_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_mul_f32_e32 v5, 4.0, v5 -; CI-NEXT: v_add_f32_e32 v4, v5, v4 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_mul_f32_e32 v4, 4.0, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; CI-NEXT: v_mul_f32_e32 v2, 4.0, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_add_f32_e32 v4, v4, v5 ; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; CI-NEXT: v_add_f32_e32 v2, v2, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; CI-NEXT: v_mov_b32_e32 v4, 0x4000 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] @@ -3763,22 +3860,27 @@ define <2 x half> @select_fneg_posk_src_fmad_v2f16_nsz(<2 x i32> %c, <2 x half> ; CI-LABEL: select_fneg_posk_src_fmad_v2f16_nsz: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_mul_f32_e32 v5, -4.0, v5 -; CI-NEXT: v_sub_f32_e32 v4, v5, v4 ; CI-NEXT: v_mul_f32_e32 v2, -4.0, v2 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_sub_f32_e32 v2, v2, v3 -; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v4, vcc +; CI-NEXT: v_mul_f32_e32 v4, -4.0, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_sub_f32_e32 v2, v2, v3 +; CI-NEXT: v_sub_f32_e32 v4, v4, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v4 +; CI-NEXT: v_mov_b32_e32 v4, 0x4000 +; CI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll b/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll index c7422a25f71e7..59c0f1cc7782f 100644 --- a/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll @@ -521,10 +521,10 @@ define half @v_test_fmin_legacy_ule_f16_safe(half %a, half %b) { ; GFX7-LABEL: v_test_fmin_legacy_ule_f16_safe: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_min_legacy_f32_e32 v0, v1, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmin_legacy_ule_f16_safe: @@ -566,10 +566,10 @@ define half @v_test_fmin_legacy_ule_f16_nnan_flag(half %a, half %b) { ; GFX7-LABEL: v_test_fmin_legacy_ule_f16_nnan_flag: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_min_legacy_f32_e32 v0, v1, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmin_legacy_ule_f16_nnan_flag: @@ -611,10 +611,10 @@ define half @v_test_fmin_legacy_ule_f16_nsz_flag(half %a, half %b) { ; GFX7-LABEL: v_test_fmin_legacy_ule_f16_nsz_flag: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_min_legacy_f32_e32 v0, v1, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmin_legacy_ule_f16_nsz_flag: @@ -696,10 +696,10 @@ define half @v_test_fmax_legacy_uge_f16_safe(half %a, half %b) { ; GFX7-LABEL: v_test_fmax_legacy_uge_f16_safe: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_max_legacy_f32_e32 v0, v1, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v3, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmax_legacy_uge_f16_safe: @@ -741,10 +741,10 @@ define half @v_test_fmax_legacy_uge_f16_nnan_flag(half %a, half %b) { ; GFX7-LABEL: v_test_fmax_legacy_uge_f16_nnan_flag: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_max_legacy_f32_e32 v0, v1, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v3, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmax_legacy_uge_f16_nnan_flag: @@ -786,10 +786,10 @@ define half @v_test_fmax_legacy_uge_f16_nsz_flag(half %a, half %b) { ; GFX7-LABEL: v_test_fmax_legacy_uge_f16_nsz_flag: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_max_legacy_f32_e32 v0, v1, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v3, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmax_legacy_uge_f16_nsz_flag: @@ -873,16 +873,17 @@ define <2 x half> @v_test_fmin_legacy_ule_v2f16_safe(<2 x half> %a, <2 x half> % ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_min_legacy_f32_e32 v2, v3, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_min_legacy_f32_e32 v0, v1, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 +; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmin_legacy_ule_v2f16_safe: @@ -944,16 +945,17 @@ define <2 x half> @v_test_fmin_legacy_ule_v2f16_nnan_flag(<2 x half> %a, <2 x ha ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_min_legacy_f32_e32 v2, v3, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_min_legacy_f32_e32 v0, v1, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 +; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmin_legacy_ule_v2f16_nnan_flag: @@ -1015,16 +1017,17 @@ define <2 x half> @v_test_fmin_legacy_ule_v2f16_nsz_flag(<2 x half> %a, <2 x hal ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_min_legacy_f32_e32 v2, v3, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_min_legacy_f32_e32 v0, v1, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 +; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmin_legacy_ule_v2f16_nsz_flag: @@ -1084,13 +1087,13 @@ define <2 x half> @v_test_fmin_legacy_ule_v2f16_nnan_nsz_flag(<2 x half> %a, <2 ; GFX7-LABEL: v_test_fmin_legacy_ule_v2f16_nnan_nsz_flag: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_min_f32_e32 v2, v3, v2 +; GFX7-NEXT: v_min_f32_e32 v2, v2, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1124,16 +1127,17 @@ define <2 x half> @v_test_fmax_legacy_uge_v2f16_safe(<2 x half> %a, <2 x half> % ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_max_legacy_f32_e32 v2, v3, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_max_legacy_f32_e32 v0, v1, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 +; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmax_legacy_uge_v2f16_safe: @@ -1195,16 +1199,17 @@ define <2 x half> @v_test_fmax_legacy_uge_v2f16_nnan_flag(<2 x half> %a, <2 x ha ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_max_legacy_f32_e32 v2, v3, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_max_legacy_f32_e32 v0, v1, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 +; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmax_legacy_uge_v2f16_nnan_flag: @@ -1266,16 +1271,17 @@ define <2 x half> @v_test_fmax_legacy_uge_v2f16_nsz_flag(<2 x half> %a, <2 x hal ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_max_legacy_f32_e32 v2, v3, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_max_legacy_f32_e32 v0, v1, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 +; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmax_legacy_uge_v2f16_nsz_flag: @@ -1335,13 +1341,13 @@ define <2 x half> @v_test_fmax_legacy_uge_v2f16_nnan_nsz_flag(<2 x half> %a, <2 ; GFX7-LABEL: v_test_fmax_legacy_uge_v2f16_nnan_nsz_flag: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_max_f32_e32 v2, v3, v2 +; GFX7-NEXT: v_max_f32_e32 v2, v2, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1373,30 +1379,32 @@ define <4 x half> @v_test_fmin_legacy_ule_v4f16_safe(<4 x half> %a, <4 x half> % ; GFX7-LABEL: v_test_fmin_legacy_ule_v4f16_safe: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_min_legacy_f32_e32 v4, v5, v4 -; GFX7-NEXT: v_min_legacy_f32_e32 v6, v7, v6 -; GFX7-NEXT: v_min_legacy_f32_e32 v0, v2, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 -; GFX7-NEXT: v_min_legacy_f32_e32 v1, v3, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v0 +; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v11, v10 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v9, v8 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v7, v6, vcc +; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v15, v14 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v13, v12 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmin_legacy_ule_v4f16_safe: @@ -1475,30 +1483,32 @@ define <4 x half> @v_test_fmin_legacy_ule_v4f16_nnan_flag(<4 x half> %a, <4 x ha ; GFX7-LABEL: v_test_fmin_legacy_ule_v4f16_nnan_flag: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_min_legacy_f32_e32 v4, v5, v4 -; GFX7-NEXT: v_min_legacy_f32_e32 v6, v7, v6 -; GFX7-NEXT: v_min_legacy_f32_e32 v0, v2, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 -; GFX7-NEXT: v_min_legacy_f32_e32 v1, v3, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v0 +; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v11, v10 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v9, v8 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v7, v6, vcc +; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v15, v14 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v13, v12 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmin_legacy_ule_v4f16_nnan_flag: @@ -1577,30 +1587,32 @@ define <4 x half> @v_test_fmin_legacy_ule_v4f16_nsz_flag(<4 x half> %a, <4 x hal ; GFX7-LABEL: v_test_fmin_legacy_ule_v4f16_nsz_flag: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_min_legacy_f32_e32 v4, v5, v4 -; GFX7-NEXT: v_min_legacy_f32_e32 v6, v7, v6 -; GFX7-NEXT: v_min_legacy_f32_e32 v0, v2, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 -; GFX7-NEXT: v_min_legacy_f32_e32 v1, v3, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v0 +; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v11, v10 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v9, v8 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v7, v6, vcc +; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v15, v14 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v13, v12 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmin_legacy_ule_v4f16_nsz_flag: @@ -1679,29 +1691,29 @@ define <4 x half> @v_test_fmin_legacy_ule_v4f16_nnan_nsz_flag(<4 x half> %a, <4 ; GFX7-LABEL: v_test_fmin_legacy_ule_v4f16_nnan_nsz_flag: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v4, v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_min_f32_e32 v4, v5, v4 -; GFX7-NEXT: v_min_f32_e32 v6, v7, v6 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v5, v6, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1731,30 +1743,32 @@ define <4 x half> @v_test_fmax_legacy_uge_v4f16_safe(<4 x half> %a, <4 x half> % ; GFX7-LABEL: v_test_fmax_legacy_uge_v4f16_safe: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_max_legacy_f32_e32 v4, v5, v4 -; GFX7-NEXT: v_max_legacy_f32_e32 v6, v7, v6 -; GFX7-NEXT: v_max_legacy_f32_e32 v0, v2, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 -; GFX7-NEXT: v_max_legacy_f32_e32 v1, v3, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v0 +; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v11, v10 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v9, v8 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v7, v6, vcc +; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v15, v14 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v13, v12 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmax_legacy_uge_v4f16_safe: @@ -1833,30 +1847,32 @@ define <4 x half> @v_test_fmax_legacy_uge_v4f16_nnan_flag(<4 x half> %a, <4 x ha ; GFX7-LABEL: v_test_fmax_legacy_uge_v4f16_nnan_flag: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_max_legacy_f32_e32 v4, v5, v4 -; GFX7-NEXT: v_max_legacy_f32_e32 v6, v7, v6 -; GFX7-NEXT: v_max_legacy_f32_e32 v0, v2, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 -; GFX7-NEXT: v_max_legacy_f32_e32 v1, v3, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v0 +; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v11, v10 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v9, v8 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v7, v6, vcc +; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v15, v14 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v13, v12 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmax_legacy_uge_v4f16_nnan_flag: @@ -1935,30 +1951,32 @@ define <4 x half> @v_test_fmax_legacy_uge_v4f16_nsz_flag(<4 x half> %a, <4 x hal ; GFX7-LABEL: v_test_fmax_legacy_uge_v4f16_nsz_flag: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_max_legacy_f32_e32 v4, v5, v4 -; GFX7-NEXT: v_max_legacy_f32_e32 v6, v7, v6 -; GFX7-NEXT: v_max_legacy_f32_e32 v0, v2, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 -; GFX7-NEXT: v_max_legacy_f32_e32 v1, v3, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v0 +; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v11, v10 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v9, v8 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v7, v6, vcc +; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v15, v14 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v13, v12 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmax_legacy_uge_v4f16_nsz_flag: @@ -2037,29 +2055,29 @@ define <4 x half> @v_test_fmax_legacy_uge_v4f16_nnan_nsz_flag(<4 x half> %a, <4 ; GFX7-LABEL: v_test_fmax_legacy_uge_v4f16_nnan_nsz_flag: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v4, v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_max_f32_e32 v4, v5, v4 -; GFX7-NEXT: v_max_f32_e32 v6, v7, v6 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/select-phi-s16-fp.ll b/llvm/test/CodeGen/AMDGPU/select-phi-s16-fp.ll index ba04cdb795ce3..563e95f7f55b5 100644 --- a/llvm/test/CodeGen/AMDGPU/select-phi-s16-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/select-phi-s16-fp.ll @@ -15,14 +15,12 @@ define void @phi_vec1half_to_f32_with_const_folding(ptr addrspace(1) %dst) #0 { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: ; %bb.1: ; %bb -; CHECK-NEXT: v_cvt_f16_f32_e64 v2, s4 -; CHECK-NEXT: s_mov_b32 s7, 0xf000 -; CHECK-NEXT: s_mov_b32 s6, 0 -; CHECK-NEXT: s_mov_b32 s4, s6 -; CHECK-NEXT: s_mov_b32 s5, s6 -; CHECK-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64 offset:2 -; CHECK-NEXT: v_cvt_f16_f32_e64 v2, s4 -; CHECK-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64 +; CHECK-NEXT: s_mov_b32 s11, 0xf000 +; CHECK-NEXT: s_mov_b32 s10, 0 +; CHECK-NEXT: s_mov_b32 s8, s10 +; CHECK-NEXT: s_mov_b32 s9, s10 +; CHECK-NEXT: v_mov_b32_e32 v2, s4 +; CHECK-NEXT: buffer_store_short v2, v[0:1], s[8:11], 0 addr64 offset:2 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -48,17 +46,13 @@ define void @phi_vec1half_to_f32(ptr addrspace(1) %src, ptr addrspace(1) %dst) # ; CHECK-NEXT: s_mov_b32 s4, s6 ; CHECK-NEXT: s_mov_b32 s5, s6 ; CHECK-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cvt_f32_f16_e64 v0, v0 ; CHECK-NEXT: ; %bb.1: ; %bb -; CHECK-NEXT: v_cvt_f16_f32_e64 v0, v0 ; CHECK-NEXT: s_mov_b32 s7, 0xf000 ; CHECK-NEXT: s_mov_b32 s6, 0 ; CHECK-NEXT: s_mov_b32 s4, s6 ; CHECK-NEXT: s_mov_b32 s5, s6 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 offset:2 -; CHECK-NEXT: v_cvt_f16_f32_e64 v0, s4 -; CHECK-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll index 52cb3935b9a01..195d222408139 100644 --- a/llvm/test/CodeGen/AMDGPU/select.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -10,39 +10,36 @@ define amdgpu_kernel void @select_f16( ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x11 ; SI-NEXT: s_mov_b32 s18, s2 +; SI-NEXT: s_mov_b32 s19, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s16, s10 ; SI-NEXT: s_mov_b32 s17, s11 -; SI-NEXT: s_mov_b32 s19, s3 ; SI-NEXT: s_mov_b32 s20, s12 ; SI-NEXT: s_mov_b32 s21, s13 ; SI-NEXT: s_mov_b32 s22, s2 ; SI-NEXT: s_mov_b32 s23, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v1, off, s[20:23], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x11 ; SI-NEXT: s_mov_b32 s12, s14 ; SI-NEXT: s_mov_b32 s13, s15 ; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_mov_b32 s15, s3 ; SI-NEXT: s_mov_b32 s6, s2 ; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[20:23], 0 glc -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, off, s[12:15], 0 glc -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SI-NEXT: buffer_load_ushort v3, off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b32 s0, s8 ; SI-NEXT: s_mov_b32 s1, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -185,6 +182,8 @@ define amdgpu_kernel void @select_f16_imm_a( ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b32 s16, s4 ; SI-NEXT: s_mov_b32 s17, s5 ; SI-NEXT: s_mov_b32 s18, s10 @@ -193,8 +192,6 @@ define amdgpu_kernel void @select_f16_imm_a( ; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc @@ -202,11 +199,8 @@ define amdgpu_kernel void @select_f16_imm_a( ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, 0.5, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; @@ -329,6 +323,8 @@ define amdgpu_kernel void @select_f16_imm_b( ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b32 s16, s4 ; SI-NEXT: s_mov_b32 s17, s5 ; SI-NEXT: s_mov_b32 s18, s10 @@ -337,8 +333,6 @@ define amdgpu_kernel void @select_f16_imm_b( ; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc @@ -346,11 +340,8 @@ define amdgpu_kernel void @select_f16_imm_b( ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0.5, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; @@ -477,24 +468,23 @@ define amdgpu_kernel void @select_f16_imm_c( ; SI-NEXT: s_mov_b32 s17, s5 ; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_mov_b32 s19, s11 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, 0x3800 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1 -; SI-NEXT: v_cndmask_b32_e32 v0, 0.5, v2, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; @@ -620,24 +610,23 @@ define amdgpu_kernel void @select_f16_imm_d( ; SI-NEXT: s_mov_b32 s17, s5 ; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_mov_b32 s19, s11 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, 0x3800 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; SI-NEXT: v_cndmask_b32_e32 v0, 0.5, v2, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; @@ -752,54 +741,47 @@ define amdgpu_kernel void @select_v2f16( ; SI-LABEL: select_v2f16: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x11 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s18, s2 +; SI-NEXT: s_mov_b32 s19, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s16, s10 ; SI-NEXT: s_mov_b32 s17, s11 -; SI-NEXT: s_mov_b32 s19, s3 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x11 ; SI-NEXT: s_mov_b32 s20, s12 ; SI-NEXT: s_mov_b32 s21, s13 ; SI-NEXT: s_mov_b32 s22, s2 ; SI-NEXT: s_mov_b32 s23, s3 +; SI-NEXT: buffer_load_dword v0, off, s[16:19], 0 +; SI-NEXT: buffer_load_dword v1, off, s[20:23], 0 ; SI-NEXT: s_mov_b32 s6, s2 ; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; SI-NEXT: s_mov_b32 s12, s14 ; SI-NEXT: s_mov_b32 s13, s15 -; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: buffer_load_dword v2, off, s[20:23], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 ; SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 ; SI-NEXT: s_mov_b32 s0, s8 ; SI-NEXT: s_mov_b32 s1, s9 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cmp_lt_f32_e32 vcc, v5, v6 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc -; SI-NEXT: v_cmp_lt_f32_e32 vcc, v1, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; SI-NEXT: v_cmp_lt_f32_e32 vcc, v4, v1 +; SI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -960,11 +942,11 @@ define amdgpu_kernel void @select_v2f16_imm_a( ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; SI-NEXT: s_mov_b32 s16, s4 ; SI-NEXT: s_mov_b32 s17, s5 ; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_mov_b32 s19, s11 -; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 @@ -975,25 +957,17 @@ define amdgpu_kernel void @select_v2f16_imm_a( ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_cmp_lt_f32_e32 vcc, 0.5, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cmp_lt_f32_e32 vcc, s2, v3 -; SI-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc -; SI-NEXT: v_cmp_lt_f32_e32 vcc, 0.5, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cndmask_b32_e32 v3, v2, v1, vcc +; SI-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; @@ -1135,11 +1109,11 @@ define amdgpu_kernel void @select_v2f16_imm_b( ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; SI-NEXT: s_mov_b32 s16, s4 ; SI-NEXT: s_mov_b32 s17, s5 ; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_mov_b32 s19, s11 -; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 @@ -1150,25 +1124,17 @@ define amdgpu_kernel void @select_v2f16_imm_b( ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0.5, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cmp_gt_f32_e32 vcc, s2, v3 -; SI-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc -; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0.5, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cndmask_b32_e32 v3, v2, v1, vcc +; SI-NEXT: v_cmp_gt_f32_e32 vcc, s2, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; @@ -1315,35 +1281,33 @@ define amdgpu_kernel void @select_v2f16_imm_c( ; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 -; SI-NEXT: v_mov_b32_e32 v3, 0x3f200000 +; SI-NEXT: v_mov_b32_e32 v3, 0x3800 +; SI-NEXT: v_mov_b32_e32 v4, 0x3900 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v5 -; SI-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc -; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v4, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cndmask_b32_e32 v1, 0.5, v2, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1 +; SI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v5, v6 +; SI-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; @@ -1489,34 +1453,32 @@ define amdgpu_kernel void @select_v2f16_imm_d( ; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 -; SI-NEXT: v_mov_b32_e32 v3, 0x3f200000 +; SI-NEXT: v_mov_b32_e32 v3, 0x3800 +; SI-NEXT: v_mov_b32_e32 v4, 0x3900 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cmp_lt_f32_e32 vcc, v4, v5 -; SI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cndmask_b32_e32 v0, 0.5, v2, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; SI-NEXT: v_cmp_lt_f32_e32 vcc, v5, v6 +; SI-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -1679,34 +1641,17 @@ define <4 x half> @v_vselect_v4f16(<4 x half> %a, <4 x half> %b, <4 x i32> %cond ; SI-LABEL: v_vselect_v4f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; SI-NEXT: v_cndmask_b32_e32 v7, v11, v10, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; SI-NEXT: v_cndmask_b32_e32 v5, v9, v8, vcc +; SI-NEXT: v_cndmask_b32_e32 v7, v3, v1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 ; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; SI-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: s_mov_b32 s4, 0xffff +; SI-NEXT: v_bfi_b32 v0, s4, v0, v3 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v7 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_vselect_v4f16: @@ -1807,62 +1752,27 @@ define <8 x half> @v_vselect_v8f16(<8 x half> %a, <8 x half> %b, <8 x i32> %cond ; SI-LABEL: v_vselect_v8f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; SI-NEXT: v_cndmask_b32_e32 v17, v18, v17, vcc -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; SI-NEXT: v_cndmask_b32_e64 v13, v13, v19, s[4:5] -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cndmask_b32_e32 v11, v18, v19, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cndmask_b32_e32 v9, v15, v16, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 +; SI-NEXT: v_cndmask_b32_e32 v15, v7, v3, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 ; SI-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 +; SI-NEXT: v_cndmask_b32_e32 v7, v6, v2, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12 ; SI-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11 +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 ; SI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 +; SI-NEXT: v_cndmask_b32_e32 v5, v4, v0, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 ; SI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v4, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v0, v0, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_or_b32_e32 v1, v1, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: s_mov_b32 s4, 0xffff +; SI-NEXT: v_bfi_b32 v0, s4, v0, v5 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v6 +; SI-NEXT: v_bfi_b32 v2, s4, v2, v7 +; SI-NEXT: v_bfi_b32 v3, s4, v3, v15 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_vselect_v8f16: @@ -2010,120 +1920,49 @@ define <16 x half> @v_vselect_v16f16(<16 x half> %a, <16 x half> %b, <16 x i32> ; SI-LABEL: v_vselect_v16f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cndmask_b32_e32 v29, v32, v31, vcc -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cndmask_b32_e32 v27, v32, v31, vcc -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cndmask_b32_e32 v25, v32, v31, vcc -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cndmask_b32_e32 v23, v32, v31, vcc -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cndmask_b32_e32 v21, v32, v31, vcc -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cndmask_b32_e32 v19, v32, v31, vcc -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cndmask_b32_e32 v17, v32, v31, vcc -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v33 -; SI-NEXT: v_cndmask_b32_e32 v31, v32, v31, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 -; SI-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v28 -; SI-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc +; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v28 +; SI-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v29 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26 +; SI-NEXT: v_cndmask_b32_e64 v26, v14, v6, s[6:7] +; SI-NEXT: v_cndmask_b32_e64 v6, v14, v6, s[4:5] +; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v27 +; SI-NEXT: v_cndmask_b32_e64 v14, v13, v5, s[4:5] ; SI-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25 +; SI-NEXT: v_cndmask_b32_e32 v25, v12, v4, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 ; SI-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v23 +; SI-NEXT: v_cndmask_b32_e32 v12, v11, v3, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v22 ; SI-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v21 +; SI-NEXT: v_cndmask_b32_e32 v11, v10, v2, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v20 ; SI-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; SI-NEXT: v_cndmask_b32_e32 v10, v9, v1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 ; SI-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 +; SI-NEXT: v_cndmask_b32_e32 v9, v8, v0, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 ; SI-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v8, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v0, v0, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_or_b32_e32 v1, v1, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v2, v2, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_or_b32_e32 v3, v3, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v4, v4, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_or_b32_e32 v5, v5, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v6, v6, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 +; SI-NEXT: v_cndmask_b32_e32 v8, v15, v7, vcc +; SI-NEXT: s_mov_b32 s4, 0xffff +; SI-NEXT: v_bfi_b32 v0, s4, v0, v9 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v10 +; SI-NEXT: v_bfi_b32 v2, s4, v2, v11 +; SI-NEXT: v_bfi_b32 v3, s4, v3, v12 +; SI-NEXT: v_bfi_b32 v4, s4, v4, v25 +; SI-NEXT: v_bfi_b32 v5, s4, v5, v14 +; SI-NEXT: v_bfi_b32 v6, s4, v6, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 +; SI-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc +; SI-NEXT: v_bfi_b32 v7, s4, v8, v7 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_vselect_v16f16: @@ -2388,324 +2227,146 @@ define <32 x half> @v_vselect_v32f16(<32 x half> %a, <32 x half> %b, <32 x i32> ; SI-LABEL: v_vselect_v32f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v11 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v10 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v26 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v9 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:96 +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 -; SI-NEXT: v_cndmask_b32_e32 v31, v40, v37, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v33 -; SI-NEXT: v_cndmask_b32_e32 v33, v42, v41, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v35 -; SI-NEXT: v_cndmask_b32_e32 v35, v45, v43, vcc -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v36 -; SI-NEXT: v_cndmask_b32_e32 v36, v47, v46, vcc -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v59 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cndmask_b32_e32 v37, v57, v56, vcc -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v38 -; SI-NEXT: v_cndmask_b32_e32 v38, v34, v58, vcc +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v32 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v34 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v39 -; SI-NEXT: v_cndmask_b32_e32 v34, v41, v40, vcc -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cmp_eq_u32_e64 s[10:11], 0, v35 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v36 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cmp_eq_u32_e64 s[14:15], 0, v37 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cndmask_b32_e64 v37, v38, v15, s[14:15] +; SI-NEXT: v_cndmask_b32_e64 v15, v38, v15, s[12:13] +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:64 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v55 -; SI-NEXT: v_cndmask_b32_e32 v39, v40, v39, vcc +; SI-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v39 +; SI-NEXT: v_cndmask_b32_e64 v39, v30, v14, s[12:13] +; SI-NEXT: v_cndmask_b32_e64 v14, v30, v14, s[10:11] +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cmp_eq_u32_e64 s[10:11], 0, v48 +; SI-NEXT: v_cndmask_b32_e64 v48, v29, v13, s[10:11] ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v50 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v22 -; SI-NEXT: v_cndmask_b32_e32 v50, v42, v41, vcc -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cmp_eq_u32_e64 s[10:11], 0, v49 +; SI-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[10:11] ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v51 -; SI-NEXT: v_cndmask_b32_e32 v51, v40, v55, vcc +; SI-NEXT: v_cmp_eq_u32_e64 s[10:11], 0, v50 +; SI-NEXT: v_cndmask_b32_e64 v50, v28, v12, s[10:11] +; SI-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[8:9] +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v51 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:44 +; SI-NEXT: v_cndmask_b32_e64 v28, v27, v11, s[8:9] ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v52 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; SI-NEXT: v_cndmask_b32_e32 v52, v42, v41, vcc -; SI-NEXT: v_cvt_f32_f16_e32 v41, v55 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v55 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v32 +; SI-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[8:9] ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v53 -; SI-NEXT: v_cndmask_b32_e32 v53, v42, v41, vcc -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v54 -; SI-NEXT: v_cndmask_b32_e32 v54, v47, v43, vcc -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v43 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v43 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v49 -; SI-NEXT: v_cndmask_b32_e32 v49, v56, v47, vcc -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v48 -; SI-NEXT: v_cndmask_b32_e32 v48, v58, v57, vcc -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v16 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 +; SI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v33 +; SI-NEXT: v_cndmask_b32_e64 v27, v26, v10, s[8:9] +; SI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v31 +; SI-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[8:9] +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v44 -; SI-NEXT: v_cndmask_b32_e32 v44, v58, v56, vcc -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v32 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 +; SI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v34 +; SI-NEXT: v_cndmask_b32_e64 v34, v25, v9, s[8:9] +; SI-NEXT: v_cndmask_b32_e64 v9, v25, v9, s[6:7] +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:28 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v45 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 -; SI-NEXT: v_cndmask_b32_e32 v15, v58, v15, vcc -; SI-NEXT: v_cvt_f32_f16_e32 v58, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:12 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v46 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 -; SI-NEXT: v_cndmask_b32_e32 v14, v58, v14, vcc -; SI-NEXT: v_cvt_f32_f16_e32 v58, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v55 -; SI-NEXT: v_cndmask_b32_e32 v12, v29, v13, vcc +; SI-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v35 +; SI-NEXT: v_cndmask_b32_e64 v35, v24, v8, s[6:7] +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v36 +; SI-NEXT: v_cndmask_b32_e64 v8, v24, v8, s[6:7] +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v40 -; SI-NEXT: v_cndmask_b32_e32 v13, v28, v58, vcc -; SI-NEXT: v_cvt_f32_f16_e32 v28, v10 +; SI-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v38 +; SI-NEXT: v_cndmask_b32_e64 v38, v23, v7, s[6:7] +; SI-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v41 -; SI-NEXT: v_cndmask_b32_e32 v10, v27, v11, vcc -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v42 -; SI-NEXT: v_cndmask_b32_e32 v11, v26, v28, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v43 -; SI-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v59 -; SI-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v47 -; SI-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v57 +; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:8 +; SI-NEXT: v_cndmask_b32_e64 v23, v22, v6, s[4:5] ; SI-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v60 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:4 +; SI-NEXT: s_mov_b32 s4, 0xffff +; SI-NEXT: v_bfi_b32 v6, s4, v6, v23 +; SI-NEXT: v_bfi_b32 v7, s4, v7, v38 +; SI-NEXT: v_bfi_b32 v8, s4, v8, v35 +; SI-NEXT: v_bfi_b32 v9, s4, v9, v34 +; SI-NEXT: v_bfi_b32 v10, s4, v10, v27 +; SI-NEXT: v_bfi_b32 v11, s4, v11, v28 +; SI-NEXT: v_bfi_b32 v12, s4, v12, v50 +; SI-NEXT: v_bfi_b32 v13, s4, v13, v48 +; SI-NEXT: v_bfi_b32 v14, s4, v14, v39 +; SI-NEXT: v_bfi_b32 v15, s4, v15, v37 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26 +; SI-NEXT: v_cndmask_b32_e32 v26, v21, v5, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v49 ; SI-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v56 -; SI-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32 +; SI-NEXT: v_cndmask_b32_e32 v21, v20, v4, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v29 +; SI-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 +; SI-NEXT: v_cndmask_b32_e32 v20, v19, v3, vcc +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25 ; SI-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v33 +; SI-NEXT: v_cndmask_b32_e32 v19, v18, v2, vcc ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v45 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v36 ; SI-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v51 +; SI-NEXT: v_cndmask_b32_e32 v18, v17, v1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 +; SI-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 -; SI-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc +; SI-NEXT: v_cndmask_b32_e32 v17, v16, v0, vcc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v46 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v22 ; SI-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v16, v44 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v0, v0, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_or_b32_e32 v1, v1, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v2, v2, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_or_b32_e32 v3, v3, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v4, v4, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_or_b32_e32 v5, v5, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v6, v6, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_or_b32_e32 v7, v7, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v8, v8, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_or_b32_e32 v9, v9, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v35 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v33 -; SI-NEXT: v_or_b32_e32 v11, v18, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v31 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v34 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v14, v14, v16 -; SI-NEXT: v_or_b32_e32 v13, v18, v13 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_bfi_b32 v0, s4, v0, v17 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v18 +; SI-NEXT: v_bfi_b32 v2, s4, v2, v19 +; SI-NEXT: v_bfi_b32 v3, s4, v3, v20 +; SI-NEXT: v_bfi_b32 v4, s4, v4, v21 +; SI-NEXT: v_bfi_b32 v5, s4, v5, v26 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_vselect_v32f16: diff --git a/llvm/test/CodeGen/AMDGPU/strict_fpext.ll b/llvm/test/CodeGen/AMDGPU/strict_fpext.ll index 9a52b96bde709..80bf0b1336b01 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fpext.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fpext.ll @@ -10,6 +10,7 @@ define float @v_constrained_fpext_f16_to_f32_fpexcept_strict(half %arg) #0 { ; SI-LABEL: v_constrained_fpext_f16_to_f32_fpexcept_strict: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -44,9 +45,10 @@ define <2 x float> @v_constrained_fpext_v2f16_to_v2f32_fpexcept_strict(<2 x half ; SI-LABEL: v_constrained_fpext_v2f16_to_v2f32_fpexcept_strict: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX89-LABEL: v_constrained_fpext_v2f16_to_v2f32_fpexcept_strict: @@ -88,11 +90,12 @@ define <3 x float> @v_constrained_fpext_v3f16_to_v3f32_fpexcept_strict(<3 x half ; SI-LABEL: v_constrained_fpext_v3f16_to_v3f32_fpexcept_strict: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX89-LABEL: v_constrained_fpext_v3f16_to_v3f32_fpexcept_strict: @@ -197,6 +200,7 @@ define double @v_constrained_fpext_f16_to_f64_fpexcept_strict(half %arg) #0 { ; SI-LABEL: v_constrained_fpext_f16_to_f64_fpexcept_strict: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -236,8 +240,9 @@ define <2 x double> @v_constrained_fpext_v2f16_to_v2f64_fpexcept_strict(<2 x hal ; SI-LABEL: v_constrained_fpext_v2f16_to_v2f64_fpexcept_strict: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 ; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 ; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 @@ -287,10 +292,12 @@ define <3 x double> @v_constrained_fpext_v3f16_to_v2f64_fpexcept_strict(<3 x hal ; SI-LABEL: v_constrained_fpext_v3f16_to_v2f64_fpexcept_strict: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 ; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 ; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 ; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 @@ -348,7 +355,9 @@ define float @v_constrained_fneg_fpext_f16_to_f32_fpexcept_strict(half %arg) #0 ; SI-LABEL: v_constrained_fneg_fpext_f16_to_f32_fpexcept_strict: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX89-LABEL: v_constrained_fneg_fpext_f16_to_f32_fpexcept_strict: @@ -387,8 +396,9 @@ define float @v_constrained_fpext_fneg_f16_to_f32_fpexcept_strict(half %arg) #0 ; SI-LABEL: v_constrained_fpext_fneg_f16_to_f32_fpexcept_strict: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX89-LABEL: v_constrained_fpext_fneg_f16_to_f32_fpexcept_strict: @@ -507,11 +517,12 @@ define <2 x float> @v_constrained_fpext_v2f16_to_v2f32_noabi(ptr addrspace(1) %p ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_constrained_fpext_v2f16_to_v2f32_noabi: diff --git a/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll b/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll index 31c64046de11a..87b41815e36d5 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll @@ -11,9 +11,6 @@ define half @v_constrained_fptrunc_f32_to_f16_fpexcept_strict(float %arg) #0 { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX89-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict: @@ -49,13 +46,8 @@ define <2 x half> @v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict(<2 x flo ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -107,19 +99,12 @@ define <3 x half> @v_constrained_fptrunc_v3f32_to_v3f16_fpexcept_strict(<3 x flo ; SI-LABEL: v_constrained_fptrunc_v3f32_to_v3f16_fpexcept_strict: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -226,9 +211,7 @@ define half @v_constrained_fneg_fptrunc_f32_to_f16_fpexcept_strict(float %arg) # ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX89-LABEL: v_constrained_fneg_fptrunc_f32_to_f16_fpexcept_strict: @@ -268,9 +251,6 @@ define half @v_constrained_fptrunc_fneg_f32_to_f16_fpexcept_strict(float %arg) # ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX89-LABEL: v_constrained_fptrunc_fneg_f32_to_f16_fpexcept_strict: @@ -329,8 +309,6 @@ define void @v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi(float %arg, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX89-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi: @@ -368,15 +346,10 @@ define void @v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict_noabi(<2 x flo ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -438,8 +411,6 @@ define void @v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi_fneg(float % ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX89-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi_fneg: @@ -475,8 +446,6 @@ define void @v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi_fabs(float % ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e64 v0, |v0| -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX89-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi_fabs: diff --git a/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll b/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll index 9fe064c717972..ef2a06935f20a 100644 --- a/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll +++ b/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll @@ -17,6 +17,7 @@ define void @f16_arg(half %arg, ptr %ptr) #0 { ; GFX7-LABEL: f16_arg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: flat_store_dword v[1:2], v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -30,13 +31,14 @@ define void @v2f16_arg(<2 x half> %arg, ptr %ptr) #0 { ; GFX7-LABEL: v2f16_arg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v0 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v1 -; GFX7-NEXT: flat_store_dword v[1:2], v3 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; GFX7-NEXT: flat_store_dword v[0:1], v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v1 +; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; GFX7-NEXT: flat_store_dword v[3:4], v5 +; GFX7-NEXT: flat_store_dword v[1:2], v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %fpext = call <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half> %arg, metadata !"fpexcept.strict") @@ -48,17 +50,19 @@ define void @v3f16_arg(<3 x half> %arg, ptr %ptr) #0 { ; GFX7-LABEL: v3f16_arg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 8, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX7-NEXT: flat_store_dword v[0:1], v6 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v2 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; GFX7-NEXT: flat_store_dword v[2:3], v4 ; GFX7-NEXT: flat_store_dword v[0:1], v5 +; GFX7-NEXT: flat_store_dword v[2:3], v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %fpext = call <3 x float> @llvm.experimental.constrained.fpext.v3f32.v3f16(<3 x half> %arg, metadata !"fpexcept.strict") @@ -70,22 +74,24 @@ define void @v4f16_arg(<4 x half> %arg, ptr %ptr) #0 { ; GFX7-LABEL: v4f16_arg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v1 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 12, v2 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: flat_store_dword v[0:1], v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: flat_store_dword v[0:1], v4 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 8, v2 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; GFX7-NEXT: flat_store_dword v[0:1], v6 +; GFX7-NEXT: flat_store_dword v[0:1], v7 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v2 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; GFX7-NEXT: flat_store_dword v[0:1], v4 +; GFX7-NEXT: flat_store_dword v[0:1], v5 +; GFX7-NEXT: flat_store_dword v[2:3], v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %fpext = call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %arg, metadata !"fpexcept.strict") @@ -98,9 +104,6 @@ define half @f16_return(float %arg) #0 { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] %fptrunc = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") ret half %fptrunc @@ -112,13 +115,8 @@ define <2 x half> @v2f16_return(<2 x float> %arg) #0 { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] %fptrunc = call <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") @@ -129,19 +127,12 @@ define <3 x half> @v3f16_return(<3 x float> %arg) #0 { ; GFX7-LABEL: v3f16_return: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] %fptrunc = call <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") @@ -153,25 +144,15 @@ define <4 x half> @v4f16_return(<4 x float> %arg) #0 { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] %fptrunc = call <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f32(<4 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <4 x half> %fptrunc @@ -421,15 +402,10 @@ define half @call_split_type_used_outside_block_v8f16() #0 { ; GFX7-NEXT: s_addk_i32 s32, 0x400 ; GFX7-NEXT: v_writelane_b32 v40, s31, 1 ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_readlane_b32 s31, v40, 1 ; GFX7-NEXT: v_readlane_b32 s30, v40, 0 ; GFX7-NEXT: s_mov_b32 s32, s33 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_readlane_b32 s4, v40, 2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll index ca93fcf3f55a2..c3a7e2ae4f344 100644 --- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll +++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll @@ -2175,11 +2175,10 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_cmp_lg_u32 s8, 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e64 v1, |v0| -; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: v_and_b32_e32 v1, 0x7fff, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/v_mac.ll b/llvm/test/CodeGen/AMDGPU/v_mac.ll index f5dc824aae35f..d987e7c65e692 100644 --- a/llvm/test/CodeGen/AMDGPU/v_mac.ll +++ b/llvm/test/CodeGen/AMDGPU/v_mac.ll @@ -252,8 +252,16 @@ bb: ; SI-DAG: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], [[B]] ; SI: v_add_f32_e32 [[TMP2:v[0-9]+]], [[CVT_A]], [[CVT_A]] -; SI: v_mad_f32 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0 -; SI: v_madmk_f32 v{{[0-9]+}}, v{{[0-9]+}}, 0x41000000, v{{[0-9]+}} +; SI: v_cvt_f16_f32 +; SI: v_cvt_f32_f16 +; SI: v_mul_f32 +; SI: v_mul_f32 +; SI: v_cvt_f16_f32 +; SI: v_cvt_f16_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_cvt_f16_f32 +; SI: v_cvt_f16_f32 ; VI-FLUSH: v_add_f16_e32 [[TMP2:v[0-9]+]], [[A]], [[A]] ; VI-FLUSH: v_mad_f16 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0 diff --git a/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll b/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll index 8da6f2348690a..34cf771fae45e 100644 --- a/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll @@ -1,16 +1,19 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=SI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=VI %s +; FIXME: Can the SI case form the mac through the casts? + ; GCN-LABEL: {{^}}mac_f16: ; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]] ; GCN: {{buffer|flat}}_load_ushort v[[B_F16:[0-9]+]] ; GCN: {{buffer|flat}}_load_ushort v[[C_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]] -; SI: v_mac_f32_e32 v[[C_F32]], v[[A_F32]], v[[B_F32]] -; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[C_F32]] -; SI: buffer_store_short v[[R_F16]] + +; SI: v_mul_f32_e32 +; SI: v_cvt_f16_f32 +; SI: v_cvt_f32_f16 +; SI: v_cvt_f32_f16 +; SI: v_add_f32_e32 + ; VI: v_mac_f16_e32 v[[C_F16]], v[[A_F16]], v[[B_F16]] ; VI: buffer_store_short v[[C_F16]] ; GCN: s_endpgm @@ -32,8 +35,14 @@ entry: } ; GCN-LABEL: {{^}}mac_f16_same_add: -; SI: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]] -; SI: v_mac_f32_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}} +; SI: v_mul_f32_e32 +; SI: v_mul_f32_e32 +; SI: v_cvt_f16_f32 +; SI: v_cvt_f16_f32 +; SI: v_cvt_f32_f16 +; SI: v_cvt_f32_f16 +; SI: v_add_f32_e32 +; SI: v_add_f32_e32 ; VI: v_mad_f16 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]] ; VI: v_mac_f16_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}} @@ -67,8 +76,12 @@ entry: ; GCN-LABEL: {{^}}mac_f16_neg_a: ; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}} ; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}} +; SI: v_mul_f32 +; SI: v_cvt_f16_f32 ; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}} -; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]] +; SI: v_cvt_f32_f16 +; SI: v_sub_f32 + ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} @@ -94,8 +107,12 @@ entry: ; GCN-LABEL: {{^}}mac_f16_neg_b: ; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}} ; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}} +; SI: v_mul_f32_e32 +; SI: v_cvt_f16_f32 ; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}} -; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]] +; SI: v_cvt_f32_f16 +; SI: v_sub_f32_e32 +; SI: v_cvt_f16_f32 ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} @@ -121,8 +138,12 @@ entry: ; GCN-LABEL: {{^}}mac_f16_neg_c: ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 +; SI: v_mul_f32_e32 +; SI: v_cvt_f16_f32 +; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 -; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} +; SI: v_sub_f32 +; SI: v_cvt_f16_f32 ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} @@ -146,8 +167,16 @@ entry: } ; GCN-LABEL: {{^}}mac_f16_neg_a_safe_fp_math: -; SI: v_sub_f32_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} -; SI: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}} +; SI: v_cvt_f32_f16 +; SI: v_cvt_f32_f16 +; SI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -v{{[0-9]+}} + +; SI: v_mul_f32 +; SI: v_cvt_f16_f32 +; SI: v_cvt_f32_f16 +; SI: v_add_f32 +; SI: v_cvt_f16_f32 + ; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} ; VI: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}} ; GCN: s_endpgm @@ -170,8 +199,15 @@ entry: } ; GCN-LABEL: {{^}}mac_f16_neg_b_safe_fp_math: -; SI: v_sub_f32_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} -; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]] + +; SI: v_cvt_f32_f16 +; SI: v_cvt_f32_f16 +; SI: v_mul_f32_e32 +; SI: v_cvt_f16_f32 +; SI: v_cvt_f32_f16 +; SI: v_add_f32_e32 +; SI: v_cvt_f16_f32 + ; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} ; VI: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]] ; GCN: s_endpgm @@ -194,8 +230,14 @@ entry: } ; GCN-LABEL: {{^}}mac_f16_neg_c_safe_fp_math: -; SI: v_sub_f32_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} -; SI: v_mac_f32_e32 v[[NEG_A]], v{{[0-9]+}}, v{{[0-9]+}} +; SI: v_cvt_f32_f16 +; SI: v_cvt_f32_f16 +; SI: v_cvt_f32_f16 +; SI: v_mul_f32_e32 +; SI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -v{{[0-9]}} +; SI: v_add_f32_e32 +; SI: v_cvt_f16_f32 + ; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} ; VI: v_mac_f16_e32 v[[NEG_A]], v{{[0-9]+}}, v{{[0-9]+}} ; GCN: s_endpgm @@ -220,8 +262,12 @@ entry: ; GCN-LABEL: {{^}}mac_f16_neg_a_nsz_fp_math: ; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}} ; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}} +; SI: v_mul_f32_e32 +; SI: v_cvt_f16_f32 ; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}} -; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]] +; SI: v_cvt_f32_f16 +; SI: v_sub_f32_e32 +; SI: v_cvt_f16_f32 ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}} @@ -247,8 +293,11 @@ entry: ; GCN-LABEL: {{^}}mac_f16_neg_b_nsz_fp_math: ; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}} ; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}} +; SI: v_mul_f32 +; SI: v_cvt_f16_f32 ; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}} -; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]] +; SI: v_sub_f32_e32 +; SI: v_cvt_f16_f32 ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}} @@ -274,8 +323,12 @@ entry: ; GCN-LABEL: {{^}}mac_f16_neg_c_nsz_fp_math: ; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}} ; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}} +; SI: v_mul_f32_e32 +; SI: v_cvt_f16_f32 ; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}} -; SI: v_mad_f32 v{{[0-9]+}}, [[CVT_A]], [[CVT_B]], -[[CVT_C]] +; SI: v_cvt_f32_f16 +; SI: v_sub_f32_e32 +; SI: v_cvt_f16_f32 ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]}} @@ -299,30 +352,16 @@ entry: } ; GCN-LABEL: {{^}}mac_v2f16: -; GCN: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]] -; GCN: {{buffer|flat}}_load_dword v[[C_V2_F16:[0-9]+]] - -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] - -; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] - -; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]] -; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]] - -; SI-DAG: v_mac_f32_e32 v[[C_F32_0]], v[[A_F32_0]], v[[B_F32_0]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]] -; SI-DAG: v_mac_f32_e32 v[[C_F32_1]], v[[A_F32_1]], v[[B_F32_1]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]] -; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; VI-NOT: and -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]] - +; SI: v_mul_f32 +; SI: v_mul_f32 +; SI: v_cvt_f16_f32 +; SI: v_cvt_f32_f16 +; SI: v_add_f32 +; SI: v_add_f32 + +; VI: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]] +; VI: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]] +; VI: {{buffer|flat}}_load_dword v[[C_V2_F16:[0-9]+]] ; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] ; VI-DAG: v_mac_f16_sdwa v[[C_F16_1]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-DAG: v_mac_f16_e32 v[[C_V2_F16]], v[[A_V2_F16]], v[[B_V2_F16]] @@ -330,8 +369,8 @@ entry: ; VI-NOT: and ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[C_V2_F16]], v[[R_F16_HI]] -; GCN: {{buffer|flat}}_store_dword v[[R_V2_F16]] -; GCN: s_endpgm +; VI: {{buffer|flat}}_store_dword v[[R_V2_F16]] +; VI: s_endpgm define amdgpu_kernel void @mac_v2f16( ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -352,10 +391,14 @@ entry: } ; GCN-LABEL: {{^}}mac_v2f16_same_add: -; SI-DAG: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SI-DAG: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; SI: v_mul_f32 +; SI: v_mul_f32 +; SI: v_cvt_f16_f32 +; SI: v_cvt_f16_f32 +; SI: v_cvt_f32_f16 +; SI: v_cvt_f32_f16 +; SI: v_add_f32 +; SI: v_add_f32 ; VI-DAG: v_mac_f16_sdwa v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-DAG: v_mad_f16 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} @@ -390,11 +433,12 @@ entry: } ; GCN-LABEL: {{^}}mac_v2f16_neg_a: -; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}} - -; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; SI: v_mul_f32 +; SI: v_cvt_f16_f32 +; SI: v_mul_f32 +; SI: v_sub_f32 +; SI: v_cvt_f16_f32 +; SI: v_sub_f32 ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} @@ -419,10 +463,15 @@ entry: } ; GCN-LABEL: {{^}}mac_v2f16_neg_b -; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}} -; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_mul_f32 +; SI: v_cvt_f16_f32 +; SI: v_mul_f32 +; SI: v_cvt_f16_f32 +; SI: v_sub_f32_e32 +; SI: v_cvt_f16_f32 +; SI: v_sub_f32_e32 ; VI-NOT: v_mac_f16 @@ -448,15 +497,15 @@ entry: } ; GCN-LABEL: {{^}}mac_v2f16_neg_c: -; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}} +; SI: v_mul_f32 +; SI: v_cvt_f16_f32 +; SI: v_mul_f32 +; SI: v_cvt_f32_f16 -; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} -; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} +; SI: v_sub_f32 +; SI: v_cvt_f16_f32 +; SI: v_sub_f32 +; SI: v_cvt_f16_f32 ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} @@ -481,11 +530,16 @@ entry: } ; GCN-LABEL: {{^}}mac_v2f16_neg_a_safe_fp_math: - -; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}} -; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} -; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}} -; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}} +; SI: v_cvt_f32_f16 +; SI: v_cvt_f32_f16 +; SI: v_mul_f32 +; SI: v_cvt_f16_f32 +; SI: v_mul_f32 +; SI: v_cvt_f32_f16 +; SI: v_add_f32 +; SI: v_cvt_f16_f32 +; SI: v_add_f32 +; SI: v_cvt_f16_f32 ; VI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 ; VI-DAG: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} @@ -513,11 +567,14 @@ entry: } ; GCN-LABEL: {{^}}mac_v2f16_neg_b_safe_fp_math: - -; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}} -; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} -; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]] -; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]] +; SI: v_cvt_f32_f16 +; SI: v_cvt_f32_f16 +; SI: v_mul_f32 +; SI: v_cvt_f16_f32 +; SI: v_mul_f32 +; SI: v_cvt_f16_f32 +; SI: v_add_f32 +; SI: v_add_f32 ; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 ; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} @@ -545,11 +602,12 @@ entry: } ; GCN-LABEL: {{^}}mac_v2f16_neg_c_safe_fp_math: - -; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}} -; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} -; SI-DAG: v_mac_f32_e32 v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}} -; SI-DAG: v_mac_f32_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}} +; SI: v_cvt_f32_f16 +; SI: v_cvt_f32_f16 +; SI: v_mul_f32 +; SI: v_mul_f32 +; SI: v_add_f32 +; SI: v_add_f32 ; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 ; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} @@ -577,15 +635,16 @@ entry: } ; GCN-LABEL: {{^}}mac_v2f16_neg_a_nsz_fp_math: -; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}} - -; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_mul_f32 +; SI: v_cvt_f16_f32 +; SI: v_mul_f32 +; SI: v_cvt_f16_f32 +; SI: v_sub_f32 +; SI: v_cvt_f16_f32 +; SI: v_sub_f32 +; SI: v_cvt_f16_f32 ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} @@ -610,15 +669,16 @@ entry: } ; GCN-LABEL: {{^}}mac_v2f16_neg_b_nsz_fp_math: -; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}} - -; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; SI: v_cvt_f32_f16 +; SI: v_cvt_f32_f16 +; SI: v_mul_f32 +; SI: v_cvt_f16_f32 +; SI: v_mul_f32 +; SI: v_cvt_f16_f32 +; SI: v_sub_f32 +; SI: v_cvt_f16_f32 +; SI: v_sub_f32 +; SI: v_cvt_f16_f32 ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} @@ -643,15 +703,16 @@ entry: } ; GCN-LABEL: {{^}}mac_v2f16_neg_c_nsz_fp_math: -; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}} - -; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} -; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} +; SI: v_cvt_f32_f16 +; SI: v_cvt_f32_f16 +; SI: v_mul_f32 +; SI: v_cvt_f16_f32 +; SI: v_mul_f32 +; SI: v_cvt_f16_f32 +; SI: v_sub_f32 +; SI: v_cvt_f16_f32 +; SI: v_sub_f32 +; SI: v_cvt_f16_f32 ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}} diff --git a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll index 3afe55fc93423..b675e0ffe9eed 100644 --- a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll @@ -11,22 +11,25 @@ define amdgpu_kernel void @madak_f16( ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s14, s6 ; SI-NEXT: s_mov_b32 s15, s7 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 -; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[12:15], 0 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_madak_f32 v0, v0, v1, 0x41200000 +; SI-NEXT: v_mul_f32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x41200000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -140,7 +143,6 @@ define amdgpu_kernel void @madak_f16_use_2( ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v3, 0x41200000 ; SI-NEXT: s_mov_b32 s0, s8 ; SI-NEXT: s_mov_b32 s1, s9 ; SI-NEXT: s_mov_b32 s4, s10 @@ -148,12 +150,18 @@ define amdgpu_kernel void @madak_f16_use_2( ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_madak_f32 v1, v0, v1, 0x41200000 -; SI-NEXT: v_mac_f32_e32 v3, v0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 -; SI-NEXT: buffer_store_short v1, off, s[4:7], 0 +; SI-NEXT: v_mul_f32_e32 v1, v0, v1 +; SI-NEXT: v_mul_f32_e32 v0, v0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x41200000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x41200000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v1, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: madak_f16_use_2: diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll index deb140fa7e941..0ec4c18a070fe 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll @@ -20,12 +20,14 @@ define half @test_vector_reduce_fadd_v2half(half %sp, <2 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fadd_v2half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -158,14 +160,18 @@ define half @test_vector_reduce_fadd_v3half(half %sp, <3 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fadd_v3half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -311,17 +317,23 @@ define half @test_vector_reduce_fadd_v4half(half %sp, <4 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fadd_v4half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -499,27 +511,41 @@ define half @test_vector_reduce_fadd_v8half(half %sp, <8 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fadd_v8half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v4 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v8 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v7 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v6 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -787,47 +813,77 @@ define half @test_vector_reduce_fadd_v16half(half %sp, <16 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fadd_v16half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v9 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v8 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v8 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v15, 16, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v16 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v13, 16, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v15 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v12, 16, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v14 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v13 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v12 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v11 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v7 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v10 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v8 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v9 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll index 4c212daab39ee..44b8b8bcb9ae8 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll @@ -182,30 +182,19 @@ entry: } define half @test_vector_reduce_fmax_v3half(<3 x half> %v) { -; GFX7-SDAG-LABEL: test_vector_reduce_fmax_v3half: -; GFX7-SDAG: ; %bb.0: ; %entry -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v2, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: test_vector_reduce_fmax_v3half: -; GFX7-GISEL: ; %bb.0: ; %entry -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: test_vector_reduce_fmax_v3half: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_fmax_v3half: ; GFX8-SDAG: ; %bb.0: ; %entry @@ -390,14 +379,19 @@ define half @test_vector_reduce_fmax_v4half(<4 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmax_v4half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v1, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -618,22 +612,37 @@ define half @test_vector_reduce_fmax_v8half(<8 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmax_v8half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v7 -; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v1, v6 -; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v2, v5 -; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v3, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -966,38 +975,73 @@ define half @test_vector_reduce_fmax_v16half(<16 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmax_v16half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v15, 16, v0 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v14, 16, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v8 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v12, 16, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v11, 16, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v15 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v1, v14 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v2, v13 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v3, v12 -; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v4, v11 -; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v5, v10 -; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v6, v9 -; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v7, v8 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v8 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -3584,6 +3628,5 @@ declare double @llvm.vector.reduce.fmax.v16double(<16 x double>) ; GFX10: {{.*}} ; GFX11: {{.*}} ; GFX12: {{.*}} -; GFX7: {{.*}} ; GFX8: {{.*}} ; GFX9: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmaximum.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmaximum.ll index d198bb45654da..a20b5de786271 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmaximum.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmaximum.ll @@ -102,14 +102,16 @@ define half @test_vector_reduce_fmaximum_v3half(<3 x half> %v) { ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX7-NEXT: v_max_f32_e32 v3, v0, v2 +; GFX7-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_max_f32_e32 v2, v0, v1 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -217,21 +219,25 @@ define half @test_vector_reduce_fmaximum_v4half(<4 x half> %v) { ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX7-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_max_f32_e32 v2, v0, v1 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; GFX7-NEXT: v_max_f32_e32 v1, v0, v3 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -353,40 +359,52 @@ define half @test_vector_reduce_fmaximum_v8half(<8 x half> %v) { ; GFX7-LABEL: test_vector_reduce_fmaximum_v8half: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, 0x7fc00000 +; GFX7-NEXT: v_max_f32_e32 v5, v0, v4 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_max_f32_e32 v8, v0, v7 -; GFX7-NEXT: v_mov_b32_e32 v9, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v7 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v0, v1 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v7, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_max_f32_e32 v1, v0, v6 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v6 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_max_f32_e32 v1, v0, v2 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_max_f32_e32 v1, v0, v5 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v5 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX7-NEXT: v_max_f32_e32 v1, v0, v3 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX7-NEXT: v_max_f32_e32 v1, v0, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v5, v0, v4 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v4, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v4, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -580,76 +598,104 @@ define half @test_vector_reduce_fmaximum_v16half(<16 x half> %v) { ; GFX7-LABEL: test_vector_reduce_fmaximum_v16half: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v1 +; GFX7-NEXT: v_mov_b32_e32 v10, 0x7fc00000 +; GFX7-NEXT: v_max_f32_e32 v9, v0, v8 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v8 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v9, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GFX7-NEXT: v_max_f32_e32 v16, v0, v15 -; GFX7-NEXT: v_mov_b32_e32 v17, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v15 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v16, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_max_f32_e32 v15, v0, v1 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v15, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-NEXT: v_max_f32_e32 v1, v0, v14 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v14 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_max_f32_e32 v1, v0, v2 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GFX7-NEXT: v_max_f32_e32 v1, v0, v13 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v13 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_max_f32_e32 v1, v0, v3 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-NEXT: v_max_f32_e32 v1, v0, v12 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v12 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_max_f32_e32 v1, v0, v4 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-NEXT: v_max_f32_e32 v1, v0, v11 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v11 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_max_f32_e32 v1, v0, v5 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v5 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX7-NEXT: v_max_f32_e32 v1, v0, v10 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v10 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_max_f32_e32 v1, v0, v6 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v6 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-NEXT: v_max_f32_e32 v1, v0, v9 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v9 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_max_f32_e32 v1, v0, v7 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v7 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_max_f32_e32 v1, v0, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v9, v0, v8 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v8 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v9, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v8, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v8, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v8, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v8, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll index 479dc08a4f7aa..ed5c910def3d6 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll @@ -182,30 +182,19 @@ entry: } define half @test_vector_reduce_fmin_v3half(<3 x half> %v) { -; GFX7-SDAG-LABEL: test_vector_reduce_fmin_v3half: -; GFX7-SDAG: ; %bb.0: ; %entry -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v2, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: test_vector_reduce_fmin_v3half: -; GFX7-GISEL: ; %bb.0: ; %entry -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: test_vector_reduce_fmin_v3half: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_fmin_v3half: ; GFX8-SDAG: ; %bb.0: ; %entry @@ -390,14 +379,19 @@ define half @test_vector_reduce_fmin_v4half(<4 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmin_v4half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v1, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -618,22 +612,37 @@ define half @test_vector_reduce_fmin_v8half(<8 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmin_v8half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v7 -; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v1, v6 -; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v2, v5 -; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v3, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -966,38 +975,73 @@ define half @test_vector_reduce_fmin_v16half(<16 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmin_v16half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v15, 16, v0 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v14, 16, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v8 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v12, 16, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v11, 16, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v15 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v1, v14 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v2, v13 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v3, v12 -; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v4, v11 -; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v5, v10 -; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v6, v9 -; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v7, v8 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v8 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -3583,6 +3627,5 @@ declare double @llvm.vector.reduce.fmin.v16double(<16 x double>) ; GFX10: {{.*}} ; GFX11: {{.*}} ; GFX12: {{.*}} -; GFX7: {{.*}} ; GFX8: {{.*}} ; GFX9: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fminimum.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fminimum.ll index 506d847c1144b..63e42e1e8a320 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fminimum.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fminimum.ll @@ -126,14 +126,16 @@ define half @test_vector_reduce_fminimum_v3half(<3 x half> %v) { ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX7-NEXT: v_min_f32_e32 v3, v0, v2 +; GFX7-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_min_f32_e32 v2, v0, v1 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -266,21 +268,25 @@ define half @test_vector_reduce_fminimum_v4half(<4 x half> %v) { ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX7-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_min_f32_e32 v2, v0, v1 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; GFX7-NEXT: v_min_f32_e32 v1, v0, v3 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -431,40 +437,52 @@ define half @test_vector_reduce_fminimum_v8half(<8 x half> %v) { ; GFX7-LABEL: test_vector_reduce_fminimum_v8half: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, 0x7fc00000 +; GFX7-NEXT: v_min_f32_e32 v5, v0, v4 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_min_f32_e32 v8, v0, v7 -; GFX7-NEXT: v_mov_b32_e32 v9, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v7 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v0, v1 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v7, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_min_f32_e32 v1, v0, v6 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v6 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_min_f32_e32 v1, v0, v2 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_min_f32_e32 v1, v0, v5 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v5 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX7-NEXT: v_min_f32_e32 v1, v0, v3 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX7-NEXT: v_min_f32_e32 v1, v0, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v5, v0, v4 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v4, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v4, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -700,76 +718,104 @@ define half @test_vector_reduce_fminimum_v16half(<16 x half> %v) { ; GFX7-LABEL: test_vector_reduce_fminimum_v16half: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v1 +; GFX7-NEXT: v_mov_b32_e32 v10, 0x7fc00000 +; GFX7-NEXT: v_min_f32_e32 v9, v0, v8 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v8 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v9, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GFX7-NEXT: v_min_f32_e32 v16, v0, v15 -; GFX7-NEXT: v_mov_b32_e32 v17, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v15 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v16, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_min_f32_e32 v15, v0, v1 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v15, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-NEXT: v_min_f32_e32 v1, v0, v14 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v14 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_min_f32_e32 v1, v0, v2 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GFX7-NEXT: v_min_f32_e32 v1, v0, v13 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v13 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_min_f32_e32 v1, v0, v3 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-NEXT: v_min_f32_e32 v1, v0, v12 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v12 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_min_f32_e32 v1, v0, v4 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-NEXT: v_min_f32_e32 v1, v0, v11 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v11 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_min_f32_e32 v1, v0, v5 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v5 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX7-NEXT: v_min_f32_e32 v1, v0, v10 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v10 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_min_f32_e32 v1, v0, v6 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v6 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-NEXT: v_min_f32_e32 v1, v0, v9 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v9 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_min_f32_e32 v1, v0, v7 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v7 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_min_f32_e32 v1, v0, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v9, v0, v8 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v8 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v9, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v8, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v8, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v8, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v8, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll index 7ea92e7b3582c..57dc288bf6dcd 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll @@ -20,12 +20,14 @@ define half @test_vector_reduce_fmul_v2half(half %sp, <2 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmul_v2half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -158,14 +160,18 @@ define half @test_vector_reduce_fmul_v3half(half %sp, <3 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmul_v3half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v3 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v3 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -311,17 +317,23 @@ define half @test_vector_reduce_fmul_v4half(half %sp, <4 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmul_v4half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v3 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v4 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v3 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -499,27 +511,41 @@ define half @test_vector_reduce_fmul_v8half(half %sp, <8 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmul_v8half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v5 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v4 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v8 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v7 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v3 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v6 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v4 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v5 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -787,47 +813,77 @@ define half @test_vector_reduce_fmul_v16half(half %sp, <16 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmul_v16half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v9 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v8 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v8 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v15, 16, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v16 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v13, 16, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v15 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v12, 16, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v14 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v13 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v12 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v11 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v7 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v10 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v8 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v9 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; From c91ea578d5734f93d3e1727580b394779bdf4e4a Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 22 Jan 2026 18:52:41 +0100 Subject: [PATCH 2/2] R600: Remove softPromoteHalfType Also includes a kind of hacky, minimal change to avoid assertions when softPromoteHalfType is removed to fix kernel arguments lowered as f16. Half support was never really implemented for r600, and there just happened to be a few incidental tests which included a half argument (which were also not even meaningful, since the function body just folded to nothing due to no callable function support). --- llvm/lib/Target/AMDGPU/R600ISelLowering.cpp | 3 + llvm/lib/Target/AMDGPU/R600ISelLowering.h | 2 - llvm/test/CodeGen/AMDGPU/kernel-args.ll | 164 ++++++++++++++++++++ 3 files changed, 167 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index 33a23ffb81926..5cd7a61d2c936 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -1480,6 +1480,9 @@ SDValue R600TargetLowering::LowerFormalArguments( MemVT = MemVT.getVectorElementType(); } + if (VT.isInteger() && !MemVT.isInteger()) + MemVT = MemVT.changeTypeToInteger(); + if (AMDGPU::isShader(CallConv)) { Register Reg = MF.addLiveIn(VA.getLocReg(), &R600::R600_Reg128RegClass); SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT); diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.h b/llvm/lib/Target/AMDGPU/R600ISelLowering.h index bb7fc46a98cbd..661efb8684813 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.h +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.h @@ -117,8 +117,6 @@ class R600TargetLowering final : public AMDGPUTargetLowering { TargetLowering::AtomicExpansionKind shouldExpandAtomicRMWInIR(const AtomicRMWInst *RMW) const override; - - bool softPromoteHalfType() const override { return false; } }; } // End namespace llvm; diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll index a2da8876472ab..0a53b3a906fbe 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll @@ -6187,3 +6187,167 @@ define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace store volatile i32 %after.offset, ptr addrspace(1) %out, align 4 ret void } + +define amdgpu_kernel void @f16_arg(half %arg, ptr addrspace(1) %ptr) { +; SI-LABEL: f16_arg: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s6, s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: f16_arg: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; VI-NEXT: s_load_dword s2, s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: f16_arg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: f16_arg: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_16 T0.X, T0.X, 36, #3 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, 0.0, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: AND_INT T0.W, KC0[2].Z, literal.x, +; EG-NEXT: AND_INT * T1.W, T0.X, literal.y, +; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: LSHL T0.X, T1.W, PV.W, +; EG-NEXT: LSHL * T0.W, literal.x, PV.W, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: MOV T0.Y, 0.0, +; EG-NEXT: MOV * T0.Z, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Z, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; CM-LABEL: f16_arg: +; CM: ; %bb.0: +; CM-NEXT: ALU 0, @8, KC0[], KC1[] +; CM-NEXT: TEX 0 @6 +; CM-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT MSKOR T0.XW, T1.X +; CM-NEXT: CF_END +; CM-NEXT: PAD +; CM-NEXT: Fetch clause starting at 6: +; CM-NEXT: VTX_READ_16 T0.X, T0.X, 36, #3 +; CM-NEXT: ALU clause starting at 8: +; CM-NEXT: MOV * T0.X, 0.0, +; CM-NEXT: ALU clause starting at 9: +; CM-NEXT: AND_INT * T0.W, KC0[2].Z, literal.x, +; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, T0.X, literal.x, +; CM-NEXT: LSHL * T0.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45) +; CM-NEXT: LSHL T0.X, PV.Z, PV.W, +; CM-NEXT: LSHL * T0.W, literal.x, PV.W, +; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; CM-NEXT: MOV T0.Y, 0.0, +; CM-NEXT: MOV * T0.Z, 0.0, +; CM-NEXT: LSHR * T1.X, KC0[2].Z, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) + store half %arg, ptr addrspace(1) %ptr + ret void +} + +define amdgpu_kernel void @v2f16_arg(<2 x half> %arg, ptr addrspace(1) %ptr) { +; SI-LABEL: v2f16_arg: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s6, s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v2f16_arg: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; VI-NEXT: s_load_dword s2, s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v2f16_arg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: v2f16_arg: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @10, KC0[], KC1[] +; EG-NEXT: TEX 1 @6 +; EG-NEXT: ALU 4, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_16 T1.X, T0.X, 38, #3 +; EG-NEXT: VTX_READ_16 T0.X, T0.X, 36, #3 +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: MOV * T0.X, 0.0, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: LSHL * T0.W, T1.X, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT T0.X, T0.X, PV.W, +; EG-NEXT: LSHR * T1.X, KC0[2].Z, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; CM-LABEL: v2f16_arg: +; CM: ; %bb.0: +; CM-NEXT: ALU 0, @10, KC0[], KC1[] +; CM-NEXT: TEX 1 @6 +; CM-NEXT: ALU 4, @11, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X +; CM-NEXT: CF_END +; CM-NEXT: PAD +; CM-NEXT: Fetch clause starting at 6: +; CM-NEXT: VTX_READ_16 T1.X, T0.X, 38, #3 +; CM-NEXT: VTX_READ_16 T0.X, T0.X, 36, #3 +; CM-NEXT: ALU clause starting at 10: +; CM-NEXT: MOV * T0.X, 0.0, +; CM-NEXT: ALU clause starting at 11: +; CM-NEXT: LSHL * T0.W, T1.X, literal.x, +; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; CM-NEXT: OR_INT * T0.X, T0.X, PV.W, +; CM-NEXT: LSHR * T1.X, KC0[2].Z, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) + store <2 x half> %arg, ptr addrspace(1) %ptr + ret void +}